Use non-regex approach for VS16 adjustments

janlelis · Nov 18, 2024 · 169ad17 · 169ad17
1 parent 82b17bd
commit 169ad17
Show file tree

Hide file tree

Showing 2 changed files with 99 additions and 47 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,9 @@
 # CHANGELOG
 
+## 3.1.1 (unreleased)
+
+- Performance improvements
+
 ## 3.1.0
 
 **Improve Emoji support:**

diff --git a/lib/unicode/display_width.rb b/lib/unicode/display_width.rb
@@ -8,8 +8,17 @@
 
 module Unicode
   class DisplayWidth
-    DEFAULT_AMBIGUOUS = 1
     INITIAL_DEPTH = 0x10000
+    def self.width_in_index(codepoint, index)
+      d = INITIAL_DEPTH
+      w = index[codepoint / d]
+      while w.instance_of? Array
+        w = w[(codepoint %= d) / (d /= 16)]
+      end
+      w || 1
+    end
+
+    DEFAULT_AMBIGUOUS = 1
     ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n\v\f\r\x0E\x0F]/
     ASCII_NON_ZERO_STRING = "\0\x05\a\b\n\v\f\r\x0E\x0F"
     ASCII_BACKSPACE = "\b"
@@ -25,11 +34,19 @@ class DisplayWidth
       WIDTH_ONE: decompress_index(INDEX[:WIDTH_ONE][0][0], 1),
       WIDTH_TWO: decompress_index(INDEX[:WIDTH_TWO][0][0], 1),
     }
+    VS16_TEXT_CODEPOINTS = {
+      WIDTH_ONE: Unicode::Emoji::TEXT_PRESENTATION - Unicode::Emoji::EMOJI_COMPONENT,
+      WIDTH_TWO: (Unicode::Emoji::TEXT_PRESENTATION - Unicode::Emoji::EMOJI_COMPONENT).reject{ |codepoint|
+        width_in_index(codepoint, INDEX[:WIDTH_TWO]) == 2
+      },
+    }
     EMOJI_SEQUENCES_REGEX_MAPPING = {
       rgi: :REGEX_INCLUDE_MQE_UQE,
       rgi_at: :REGEX_INCLUDE_MQE_UQE,
       possible: :REGEX_WELL_FORMED,
     }
+    EMOJI_NON_VS16_OPTIONS = [:all_no_vs16, :rgi_at, :none, false]
+    VS16 = 0xFE0F
     REGEX_EMOJI_BASIC_OR_KEYCAP = Regexp.union(Unicode::Emoji::REGEX_BASIC, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
     REGEX_EMOJI_ALL_SEQUENCES = Regexp.union(/.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?(\u{200D}.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?)+/, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
     REGEX_EMOJI_NOT_POSSIBLE = /\A[#*0-9]\z/
@@ -61,14 +78,25 @@ def self.of(string, ambiguous = nil, overwrite = nil, old_options = {}, **option
       # # #
 
       if !options[:overwrite].empty?
-        return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
-          width_all_features(string, index_full, index_low, first_ambiguous, options[:overwrite])
+        return width_frame(string, options) do |string, index_full, index_low, first_ambiguous, vs16_text_codepoints|
+          width_all_features(
+            string,
+            index_full,
+            index_low,
+            first_ambiguous,
+            options[:overwrite],
+            EMOJI_NON_VS16_OPTIONS.include?(options[:emoji]) ? nil : vs16_text_codepoints
+          )
         end
       end
 
       if !string.ascii_only?
-        return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
-          width_no_overwrite(string, index_full, index_low, first_ambiguous)
+        return width_frame(string, options) do |string, index_full, index_low, first_ambiguous, vs16_text_codepoints|
+          if EMOJI_NON_VS16_OPTIONS.include?(options[:emoji])
+            width_no_overwrite(string, index_full, index_low, first_ambiguous)
+          else
+            width_no_overwrite_with_vs16(string, index_full, index_low, first_ambiguous, vs16_text_codepoints)
+          end
         end
       end
 
@@ -102,7 +130,13 @@ def self.width_frame(string, options)
       ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]
 
       # Get general width
-      res += yield(string, INDEX[ambiguous_index_name], FIRST_4096[ambiguous_index_name], FIRST_AMBIGUOUS[ambiguous_index_name])
+      res += yield(
+        string,
+        INDEX[ambiguous_index_name],
+        FIRST_4096[ambiguous_index_name],
+        FIRST_AMBIGUOUS[ambiguous_index_name],
+        VS16_TEXT_CODEPOINTS[ambiguous_index_name]
+      )
 
       # Return result + prevent negative lengths
       res < 0 ? 0 : res
@@ -139,26 +173,73 @@ def self.width_no_overwrite(string, index_full, index_low, first_ambiguous, _ =
       res
     end
 
+    def self.width_no_overwrite_with_vs16(string, index_full, index_low, first_ambiguous, vs16_text_codepoints)
+      res = 0
+
+      # Make sure we have UTF-8
+      string = string.encode(Encoding::UTF_8) unless string.encoding.name == "utf-8"
+
+      # Track last codepoint and apply VS16 adjustment if necessary
+      last_codepoint = nil
+
+      string.scan(/.{,80}/m){ |batch|
+        if batch.ascii_only?
+          res += batch.size
+        else
+          batch.each_codepoint{ |codepoint|
+            if codepoint > 15 && codepoint < first_ambiguous
+              res += 1
+            elsif codepoint < 0x1001
+              res += index_low[codepoint] || 1
+            elsif codepoint == VS16 && vs16_text_codepoints.include?(last_codepoint)
+              res += 1
+            else
+              d = INITIAL_DEPTH
+              c = codepoint
+              w = index_full[c / d]
+              while w.instance_of? Array
+                w = w[(c %= d) / (d /= 16)]
+              end
+
+              res += w || 1
+            end
+
+            last_codepoint = codepoint
+          }
+        end
+      }
+
+      res
+    end
+
     # Same as .width_no_overwrite - but with applying overwrites for each char
-    def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite)
+    def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite, vs16_text_codepoints)
       res = 0
 
+      # Track last codepoint and apply VS16 adjustment if necessary
+      last_codepoint = nil
+
       string.each_codepoint{ |codepoint|
         if overwrite[codepoint]
           res += overwrite[codepoint]
         elsif codepoint > 15 && codepoint < first_ambiguous
           res += 1
         elsif codepoint < 0x1001
           res += index_low[codepoint] || 1
+        elsif codepoint == VS16 && vs16_text_codepoints && vs16_text_codepoints.include?(last_codepoint)
+          res += 1
         else
           d = INITIAL_DEPTH
-          w = index_full[codepoint / d]
+          c = codepoint
+          w = index_full[c / d]
           while w.instance_of? Array
-            w = w[(codepoint %= d) / (d /= 16)]
+            w = w[(c %= d) / (d /= 16)]
           end
 
           res += w || 1
         end
+
+        last_codepoint = codepoint
       }
 
       res
@@ -177,35 +258,13 @@ def self.emoji_width(string, mode = :all, ambiguous = DEFAULT_AMBIGUOUS)
           mode == :rgi_at,
           ambiguous,
         )
-      elsif mode == :all_no_vs16
+      elsif mode == :all_no_vs16 || mode == :all
         emoji_width_all(string)
-      elsif mode == :vs16
-        emoji_width_basic(string)
-      elsif mode == :all
-        res_all, string = emoji_width_all(string)
-        res_basic, string = emoji_width_basic(string)
-        [res_all + res_basic, string]
       else
         [0, string]
       end
     end
 
-    # Ensure all explicit VS16 sequences have width 2
-    def self.emoji_width_basic(string)
-      res = 0
-
-      no_emoji_string = string.gsub(REGEX_EMOJI_BASIC_OR_KEYCAP){ |basic_emoji|
-        if basic_emoji.size >= 2 # VS16 present
-          res += 2
-          ""
-        else
-          basic_emoji
-        end
-      }
-
-      [res, no_emoji_string]
-    end
-
     # Use simplistic ZWJ/modifier/keycap sequence matching
     def self.emoji_width_all(string)
       res = 0
@@ -226,31 +285,20 @@ def self.emoji_width_via_possible(string, emoji_set_regex, strict_eaw = false, a
       no_emoji_string = string.gsub(Unicode::Emoji::REGEX_POSSIBLE){ |emoji_candidate|
         # Skip notorious false positives
         if REGEX_EMOJI_NOT_POSSIBLE.match?(emoji_candidate)
-          emoji_candidate
+          res += 1
+          ""
 
         # Check if we have a combined Emoji with width 2 (or, on Apple Terminal, the EAW-based width of its first codepoint)
         elsif emoji_candidate == emoji_candidate[emoji_set_regex]
           if strict_eaw
-            res += self.of(emoji_candidate[0], ambiguous, emoji: false)
+            res += self.width_in_index(emoji_candidate[0].ord, INDEX[AMBIGUOUS_MAP[ambiguous]])
           else
             res += 2
           end
           ""
 
-        # We are dealing with a default text presentation emoji or a well-formed sequence not matching the above Emoji set
+        # Use other counting mechanisms
         else
-          if !strict_eaw
-            # Ensure all explicit VS16 sequences have width 2
-            emoji_candidate.gsub!(Unicode::Emoji::REGEX_BASIC){ |basic_emoji|
-              if basic_emoji.size == 2 # VS16 present
-                res += 2
-                ""
-              else
-                basic_emoji
-              end
-            }
-          end
-
           emoji_candidate
         end
       }