diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 65ae483183901..a93b94867ce4c 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -927,7 +927,7 @@ impl char { #[must_use] #[inline] pub(crate) fn is_grapheme_extended(self) -> bool { - self > '\x7f' && unicode::Grapheme_Extend(self) + unicode::Grapheme_Extend(self) } /// Returns `true` if this `char` has one of the general categories for numbers. diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index dd2ad9a58f679..1b3d6729663b5 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -315,7 +315,11 @@ pub mod grapheme_extend { 15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 100, 1, 160, 7, 0, 1, 61, 4, 0, 4, 0, 7, 109, 7, 0, 96, 128, 240, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { + (c as u32) >= 0x300 && lookup_slow(c) + } + fn lookup_slow(c: char) -> bool { super::skip_search( c as u32, &SHORT_OFFSET_RUNS, diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index 7547b49ab2a54..ef5cea18ea2d7 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -23,6 +23,7 @@ impl RawEmitter { } fn emit_bitset(&mut self, ranges: &[Range<u32>]) -> Result<(), String> { + let first_code_point = ranges.first().unwrap().start; let last_code_point = ranges.last().unwrap().end; // bitset for every bit in the codepoint range // @@ -101,7 +102,10 @@ impl RawEmitter { ) .unwrap(); writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap(); - writeln!(&mut self.file, " super::bitset_search(",).unwrap(); + if first_code_point > 0x7f { + writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap(); + } + writeln!(&mut self.file, " super::bitset_search(").unwrap(); writeln!(&mut self.file, " c as u32,").unwrap();
writeln!(&mut self.file, " &BITSET_CHUNKS_MAP,").unwrap(); writeln!(&mut self.file, " &BITSET_INDEX_CHUNKS,").unwrap(); diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs index 9b613a94c5795..8fae8289e251d 100644 --- a/src/tools/unicode-table-generator/src/skiplist.rs +++ b/src/tools/unicode-table-generator/src/skiplist.rs @@ -25,8 +25,9 @@ impl ShortOffsetRunHeader { impl RawEmitter { pub fn emit_skiplist(&mut self, ranges: &[Range<u32>]) { + let first_code_point = ranges.first().unwrap().start; let mut offsets = Vec::<u32>::new(); - let points = ranges.iter().flat_map(|r| vec![r.start, r.end]).collect::<Vec<u32>>(); + let points = ranges.iter().flat_map(|r| [r.start, r.end]).collect::<Vec<u32>>(); let mut offset = 0; for pt in points { let delta = pt - offset; @@ -86,7 +87,26 @@ impl RawEmitter { .unwrap(); self.bytes_used += coded_offsets.len(); - writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + // The inlining in this code works like the following: + // + // The `skip_search` function is always inlined into the parent `lookup` fn, + // thus the compiler can generate optimal code based on the referenced `static`s. + // + // In the case of ASCII optimization, the lower-bounds check is inlined into + // the caller, and slower-path `skip_search` is outlined into a separate `lookup_slow` fn. + // + // Thus, in both cases, the `skip_search` function is specialized for the `static`s, + // and outlined into the prebuilt `std`.
+ if first_code_point > 0x7f { + writeln!(&mut self.file, "#[inline]").unwrap(); + writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)") + .unwrap(); + writeln!(&mut self.file, "}}").unwrap(); + writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap(); + } else { + writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + } writeln!(&mut self.file, " super::skip_search(",).unwrap(); writeln!(&mut self.file, " c as u32,").unwrap(); writeln!(&mut self.file, " &SHORT_OFFSET_RUNS,").unwrap();