Skip to content

Commit

Permalink
Use non-regex approach for VS16 adjustments
Browse files Browse the repository at this point in the history
  • Loading branch information
janlelis committed Nov 18, 2024
1 parent 82b17bd commit 169ad17
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 47 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# CHANGELOG

## 3.1.1 (unreleased)

- Performance improvements

## 3.1.0

**Improve Emoji support:**
Expand Down
142 changes: 95 additions & 47 deletions lib/unicode/display_width.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,17 @@

module Unicode
class DisplayWidth
DEFAULT_AMBIGUOUS = 1
INITIAL_DEPTH = 0x10000
# Looks up the width stored for +codepoint+ in a nested width +index+.
#
# The top level is addressed by codepoint / INITIAL_DEPTH; every nested
# Array narrows the covered range by a factor of 16. The first non-Array
# entry ends the descent, and a missing (nil) entry counts as width 1.
def self.width_in_index(codepoint, index)
  span = INITIAL_DEPTH
  entry = index[codepoint / span]

  while entry.instance_of?(Array)
    # Reduce to the offset inside the current range before narrowing it
    codepoint %= span
    span /= 16
    entry = entry[codepoint / span]
  end

  entry || 1
end

DEFAULT_AMBIGUOUS = 1
ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n\v\f\r\x0E\x0F]/
ASCII_NON_ZERO_STRING = "\0\x05\a\b\n\v\f\r\x0E\x0F"
ASCII_BACKSPACE = "\b"
Expand All @@ -25,11 +34,19 @@ class DisplayWidth
WIDTH_ONE: decompress_index(INDEX[:WIDTH_ONE][0][0], 1),
WIDTH_TWO: decompress_index(INDEX[:WIDTH_TWO][0][0], 1),
}
# Codepoints after which a directly following VS16 adds one extra column in
# the per-codepoint width loops, keyed by ambiguous-width index name.
# Built from Unicode::Emoji::TEXT_PRESENTATION minus
# Unicode::Emoji::EMOJI_COMPONENT; for WIDTH_TWO, codepoints that the
# WIDTH_TWO index already rates as width 2 are dropped.
VS16_TEXT_CODEPOINTS = {
WIDTH_ONE: Unicode::Emoji::TEXT_PRESENTATION - Unicode::Emoji::EMOJI_COMPONENT,
WIDTH_TWO: (Unicode::Emoji::TEXT_PRESENTATION - Unicode::Emoji::EMOJI_COMPONENT).reject{ |codepoint|
width_in_index(codepoint, INDEX[:WIDTH_TWO]) == 2
},
}
# Maps an emoji mode to the name of a regex constant.
# NOTE(review): presumably resolved on Unicode::Emoji - the lookup itself is
# not visible in this part of the file; confirm.
EMOJI_SEQUENCES_REGEX_MAPPING = {
rgi: :REGEX_INCLUDE_MQE_UQE,
rgi_at: :REGEX_INCLUDE_MQE_UQE,
possible: :REGEX_WELL_FORMED,
}
# Values of options[:emoji] for which .of skips the VS16 adjustment in the
# codepoint loop
EMOJI_NON_VS16_OPTIONS = [:all_no_vs16, :rgi_at, :none, false]
# Codepoint of VARIATION SELECTOR-16 (U+FE0F), compared against while
# iterating codepoints
VS16 = 0xFE0F
REGEX_EMOJI_BASIC_OR_KEYCAP = Regexp.union(Unicode::Emoji::REGEX_BASIC, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
REGEX_EMOJI_ALL_SEQUENCES = Regexp.union(/.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?(\u{200D}.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?)+/, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
REGEX_EMOJI_NOT_POSSIBLE = /\A[#*0-9]\z/
Expand Down Expand Up @@ -61,14 +78,25 @@ def self.of(string, ambiguous = nil, overwrite = nil, old_options = {}, **option
# # #

if !options[:overwrite].empty?
return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
width_all_features(string, index_full, index_low, first_ambiguous, options[:overwrite])
return width_frame(string, options) do |string, index_full, index_low, first_ambiguous, vs16_text_codepoints|
width_all_features(
string,
index_full,
index_low,
first_ambiguous,
options[:overwrite],
EMOJI_NON_VS16_OPTIONS.include?(options[:emoji]) ? nil : vs16_text_codepoints
)
end
end

if !string.ascii_only?
return width_frame(string, options) do |string, index_full, index_low, first_ambiguous|
width_no_overwrite(string, index_full, index_low, first_ambiguous)
return width_frame(string, options) do |string, index_full, index_low, first_ambiguous, vs16_text_codepoints|
if EMOJI_NON_VS16_OPTIONS.include?(options[:emoji])
width_no_overwrite(string, index_full, index_low, first_ambiguous)
else
width_no_overwrite_with_vs16(string, index_full, index_low, first_ambiguous, vs16_text_codepoints)
end
end
end

Expand Down Expand Up @@ -102,7 +130,13 @@ def self.width_frame(string, options)
ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]

# Get general width
res += yield(string, INDEX[ambiguous_index_name], FIRST_4096[ambiguous_index_name], FIRST_AMBIGUOUS[ambiguous_index_name])
res += yield(
string,
INDEX[ambiguous_index_name],
FIRST_4096[ambiguous_index_name],
FIRST_AMBIGUOUS[ambiguous_index_name],
VS16_TEXT_CODEPOINTS[ambiguous_index_name]
)

# Return result + prevent negative lengths
res < 0 ? 0 : res
Expand Down Expand Up @@ -139,26 +173,73 @@ def self.width_no_overwrite(string, index_full, index_low, first_ambiguous, _ =
res
end

# Sums the display width of +string+ codepoint by codepoint (no overwrites),
# adding one extra column for every VS16 (U+FE0F) that directly follows a
# codepoint listed in +vs16_text_codepoints+.
#
# @param string [String] text to measure; transcoded to UTF-8 when needed
# @param index_full [Array] nested width index, addressed by
#   codepoint / INITIAL_DEPTH and narrowed by a factor of 16 per level
# @param index_low [Array] flat width table consulted for codepoints below 0x1001
# @param first_ambiguous [Integer] codepoints above 15 and below this value
#   always count as width 1
# @param vs16_text_codepoints [#include?] codepoints that gain one column
#   when directly followed by VS16
# @return [Integer] accumulated width
def self.width_no_overwrite_with_vs16(string, index_full, index_low, first_ambiguous, vs16_text_codepoints)
  res = 0

  # Make sure we have UTF-8. Compare Encoding objects: Encoding#name returns
  # "UTF-8" (uppercase), so a comparison with "utf-8" never matches and
  # would re-encode every string.
  string = string.encode(Encoding::UTF_8) unless string.encoding == Encoding::UTF_8

  # Track last codepoint and apply VS16 adjustment if necessary
  last_codepoint = nil

  # Process in batches of up to 80 characters so ASCII-only stretches can be
  # counted without a per-codepoint lookup
  string.scan(/.{,80}/m) do |batch|
    if batch.ascii_only?
      res += batch.size

      # Keep the tracker in sync: otherwise a VS16 opening the next batch
      # would be paired with a stale codepoint from an earlier batch.
      # (The final match of the scan is an empty string - leave it alone.)
      last_codepoint = batch[-1].ord unless batch.empty?
    else
      batch.each_codepoint do |codepoint|
        if codepoint > 15 && codepoint < first_ambiguous
          res += 1
        elsif codepoint < 0x1001
          res += index_low[codepoint] || 1
        elsif codepoint == VS16 && vs16_text_codepoints.include?(last_codepoint)
          res += 1
        else
          # Same nested-index walk as .width_in_index, written out inline
          # (presumably to avoid a method call per codepoint)
          d = INITIAL_DEPTH
          c = codepoint
          w = index_full[c / d]
          while w.instance_of? Array
            w = w[(c %= d) / (d /= 16)]
          end

          res += w || 1
        end

        last_codepoint = codepoint
      end
    end
  end

  res
end

# Same as .width_no_overwrite - but with applying overwrites for each char
def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite)
def self.width_all_features(string, index_full, index_low, first_ambiguous, overwrite, vs16_text_codepoints)
res = 0

# Track last codepoint and apply VS16 adjustment if necessary
last_codepoint = nil

string.each_codepoint{ |codepoint|
if overwrite[codepoint]
res += overwrite[codepoint]
elsif codepoint > 15 && codepoint < first_ambiguous
res += 1
elsif codepoint < 0x1001
res += index_low[codepoint] || 1
elsif codepoint == VS16 && vs16_text_codepoints && vs16_text_codepoints.include?(last_codepoint)
res += 1
else
d = INITIAL_DEPTH
w = index_full[codepoint / d]
c = codepoint
w = index_full[c / d]
while w.instance_of? Array
w = w[(codepoint %= d) / (d /= 16)]
w = w[(c %= d) / (d /= 16)]
end

res += w || 1
end

last_codepoint = codepoint
}

res
Expand All @@ -177,35 +258,13 @@ def self.emoji_width(string, mode = :all, ambiguous = DEFAULT_AMBIGUOUS)
mode == :rgi_at,
ambiguous,
)
elsif mode == :all_no_vs16
elsif mode == :all_no_vs16 || mode == :all
emoji_width_all(string)
elsif mode == :vs16
emoji_width_basic(string)
elsif mode == :all
res_all, string = emoji_width_all(string)
res_basic, string = emoji_width_basic(string)
[res_all + res_basic, string]
else
[0, string]
end
end

# Strips every basic emoji / keycap match that is at least two characters
# long (i.e. carries an explicit VS16) and counts each as width 2.
# Single-character matches stay in the string untouched.
#
# Returns [accumulated width, string without the counted sequences]
def self.emoji_width_basic(string)
  width = 0

  remaining = string.gsub(REGEX_EMOJI_BASIC_OR_KEYCAP) do |match|
    # No VS16 present: keep the character for the regular width lookup
    next match if match.size < 2

    width += 2
    ""
  end

  [width, remaining]
end

# Use simplistic ZWJ/modifier/keycap sequence matching
def self.emoji_width_all(string)
res = 0
Expand All @@ -226,31 +285,20 @@ def self.emoji_width_via_possible(string, emoji_set_regex, strict_eaw = false, a
no_emoji_string = string.gsub(Unicode::Emoji::REGEX_POSSIBLE){ |emoji_candidate|
# Skip notorious false positives
if REGEX_EMOJI_NOT_POSSIBLE.match?(emoji_candidate)
emoji_candidate
res += 1
""

# Check if we have a combined Emoji with width 2 (or EAW on Apple Terminal)
elsif emoji_candidate == emoji_candidate[emoji_set_regex]
if strict_eaw
res += self.of(emoji_candidate[0], ambiguous, emoji: false)
res += self.width_in_index(emoji_candidate[0].ord, INDEX[AMBIGUOUS_MAP[ambiguous]])
else
res += 2
end
""

# We are dealing with a default text presentation emoji or a well-formed sequence not matching the above Emoji set
# Use other counting mechanisms
else
if !strict_eaw
# Ensure all explicit VS16 sequences have width 2
emoji_candidate.gsub!(Unicode::Emoji::REGEX_BASIC){ |basic_emoji|
if basic_emoji.size == 2 # VS16 present
res += 2
""
else
basic_emoji
end
}
end

emoji_candidate
end
}
Expand Down

0 comments on commit 169ad17

Please sign in to comment.