From 9dffe9573be86ac7302fb2f27219ef8f9c78dea7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Esteban=20K=C3=BCber?= Date: Thu, 18 Jul 2024 20:02:08 +0000 Subject: [PATCH] =?UTF-8?q?Make=20unicode=20text=20flow=20control=20chars?= =?UTF-8?q?=20visible=20as=20=EF=BF=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already point these out quite aggressively, telling people not to use them, but they would normally be rendered as nothing. Having them visible will make it easier for people to actually deal with them. ``` error: unicode codepoint changing visible direction of text present in literal --> $DIR/unicode-control-codepoints.rs:26:22 | LL | println!("{:?}", '�'); | ^-^ | || | |'\u{202e}' | this literal contains an invisible unicode text flow control codepoint | = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen = help: if their presence wasn't intentional, you can remove them help: if you want to keep them but make them visible in your source code, you can escape them | LL | println!("{:?}", '\u{202e}'); | ~~~~~~~~ ``` vs the previous ``` error: unicode codepoint changing visible direction of text present in literal --> $DIR/unicode-control-codepoints.rs:26:22 | LL | println!("{:?}", ''); | ^- | || | |'\u{202e}' | this literal contains an invisible unicode text flow control codepoint | = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen = help: if their presence wasn't intentional, you can remove them help: if you want to keep them but make them visible in your source code, you can escape them | LL | println!("{:?}", '\u{202e}'); | ~~~~~~~~ ``` --- compiler/rustc_errors/src/emitter.rs | 21 ++-- compiler/rustc_span/src/lib.rs | 3 +- 
.../parser/unicode-control-codepoints.stderr | 98 +++++++++---------- 3 files changed, 62 insertions(+), 60 deletions(-) diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs index 16fa0ff7a2d6c..58220c6549005 100644 --- a/compiler/rustc_errors/src/emitter.rs +++ b/compiler/rustc_errors/src/emitter.rs @@ -2558,18 +2558,19 @@ fn num_decimal_digits(num: usize) -> usize { } // We replace some characters so the CLI output is always consistent and underlines aligned. +// Keep the following list in sync with `rustc_span::char_width`. const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[ - ('\t', " "), // We do our own tab replacement + ('\t', " "), // We do our own tab replacement ('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters. - ('\u{202A}', ""), // The following unicode text flow control characters are inconsistently - ('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk - ('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always. - ('\u{202E}', ""), - ('\u{2066}', ""), - ('\u{2067}', ""), - ('\u{2068}', ""), - ('\u{202C}', ""), - ('\u{2069}', ""), + ('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently + ('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk + ('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always. + ('\u{202E}', "�"), + ('\u{2066}', "�"), + ('\u{2067}', "�"), + ('\u{2068}', "�"), + ('\u{202C}', "�"), + ('\u{2069}', "�"), // In terminals without Unicode support the following will be garbled, but in *all* terminals // the underlying codepoint will be as well. We could gate this replacement behind a "unicode // support" gate. 
diff --git a/compiler/rustc_span/src/lib.rs b/compiler/rustc_span/src/lib.rs index ea57c1ce8185b..7c8ac3be4beca 100644 --- a/compiler/rustc_span/src/lib.rs +++ b/compiler/rustc_span/src/lib.rs @@ -2093,7 +2093,8 @@ pub fn char_width(ch: char) -> usize { | '\u{000E}' | '\u{000F}' | '\u{0010}' | '\u{0011}' | '\u{0012}' | '\u{0013}' | '\u{0014}' | '\u{0015}' | '\u{0016}' | '\u{0017}' | '\u{0018}' | '\u{0019}' | '\u{001A}' | '\u{001B}' | '\u{001C}' | '\u{001D}' | '\u{001E}' | '\u{001F}' - | '\u{007F}' => 1, + | '\u{007F}' | '\u{202A}' | '\u{202B}' | '\u{202D}' | '\u{202E}' | '\u{2066}' + | '\u{2067}' | '\u{2068}' | '\u{202C}' | '\u{2069}' => 1, _ => unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1), } } diff --git a/tests/ui/parser/unicode-control-codepoints.stderr b/tests/ui/parser/unicode-control-codepoints.stderr index fc071a9419142..28de4ae72abbd 100644 --- a/tests/ui/parser/unicode-control-codepoints.stderr +++ b/tests/ui/parser/unicode-control-codepoints.stderr @@ -17,78 +17,78 @@ LL | println!("{:?}", b"us\u{202B}e\u{202A}r"); error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:26 | -LL | println!("{:?}", b"/* } if isAdmin begin admins only "); +LL | println!("{:?}", b"/*� } �if isAdmin� � begin admins only "); | ^ must be ASCII but is '\u{202e}' | help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes | -LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin begin admins only "); +LL | println!("{:?}", b"/*\xE2\x80\xAE } �if isAdmin� � begin admins only "); | ~~~~~~~~~~~~ error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:30 | -LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ must be ASCII but is '\u{2066}' +LL | println!("{:?}", b"/*� } �if isAdmin� � begin admins only "); + | ^ must be ASCII but is '\u{2066}' | help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes | -LL | println!("{:?}", b"/* } \xE2\x81\xA6if 
isAdmin begin admins only "); - | ~~~~~~~~~~~~ +LL | println!("{:?}", b"/*� } \xE2\x81\xA6if isAdmin� � begin admins only "); + | ~~~~~~~~~~~~ error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:41 | -LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ must be ASCII but is '\u{2069}' +LL | println!("{:?}", b"/*� } �if isAdmin� � begin admins only "); + | ^ must be ASCII but is '\u{2069}' | help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes | -LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only "); - | ~~~~~~~~~~~~ +LL | println!("{:?}", b"/*� } �if isAdmin\xE2\x81\xA9 � begin admins only "); + | ~~~~~~~~~~~~ error: non-ASCII character in byte string literal --> $DIR/unicode-control-codepoints.rs:16:43 | -LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ must be ASCII but is '\u{2066}' +LL | println!("{:?}", b"/*� } �if isAdmin� � begin admins only "); + | ^ must be ASCII but is '\u{2066}' | help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes | -LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only "); - | ~~~~~~~~~~~~ +LL | println!("{:?}", b"/*� } �if isAdmin� \xE2\x81\xA6 begin admins only "); + | ~~~~~~~~~~~~ error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:29 | -LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); +LL | println!("{:?}", br##"/*� } �if isAdmin� � begin admins only "##); | ^ must be ASCII but is '\u{202e}' error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:33 | -LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); - | ^ must be ASCII but is '\u{2066}' +LL | println!("{:?}", br##"/*� } �if isAdmin� � begin admins only "##); + | ^ must be ASCII but is '\u{2066}' error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:44 | -LL | 
println!("{:?}", br##"/* } if isAdmin begin admins only "##); - | ^ must be ASCII but is '\u{2069}' +LL | println!("{:?}", br##"/*� } �if isAdmin� � begin admins only "##); + | ^ must be ASCII but is '\u{2069}' error: non-ASCII character in raw byte string literal --> $DIR/unicode-control-codepoints.rs:21:46 | -LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); - | ^ must be ASCII but is '\u{2066}' +LL | println!("{:?}", br##"/*� } �if isAdmin� � begin admins only "##); + | ^ must be ASCII but is '\u{2066}' error: unicode codepoint changing visible direction of text present in comment --> $DIR/unicode-control-codepoints.rs:2:5 | -LL | // if access_level != "user" { // Check if admin - | ^^^^^^^^^^^^^^^^^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^ - | | || - | | |'\u{202a}' +LL | // if access_level != "us�e�r" { // Check if admin + | ^^^^^^^^^^^^^^^^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^ + | | | | + | | | '\u{202a}' | | '\u{202b}' | this comment contains invisible unicode text flow control codepoints | @@ -99,12 +99,12 @@ LL | // if access_level != "user" { // Check if admin error: unicode codepoint changing visible direction of text present in comment --> $DIR/unicode-control-codepoints.rs:30:1 | -LL | //"/* } if isAdmin begin admins only */" - | ^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^ - | | | | || - | | | | |'\u{2066}' - | | | | '\u{2069}' - | | | '\u{2066}' +LL | //"/*� } �if isAdmin� � begin admins only */" + | ^^^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^ + | | | | | | + | | | | | '\u{2066}' + | | | | '\u{2069}' + | | | '\u{2066}' | | '\u{202e}' | this comment contains invisible unicode text flow control codepoints | @@ -114,12 +114,12 @@ LL | //"/* } if isAdmin begin admins only */" error: unicode codepoint changing visible direction of text present in literal --> $DIR/unicode-control-codepoints.rs:11:22 | -LL | println!("{:?}", "/* } if isAdmin begin admins only "); - | ^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^ - | | | | || - | | | | |'\u{2066}' - | | | | 
'\u{2069}' - | | | '\u{2066}' +LL | println!("{:?}", "/*� } �if isAdmin� � begin admins only "); + | ^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^ + | | | | | | + | | | | | '\u{2066}' + | | | | '\u{2069}' + | | | '\u{2066}' | | '\u{202e}' | this literal contains invisible unicode text flow control codepoints | @@ -134,12 +134,12 @@ LL | println!("{:?}", "/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} begi error: unicode codepoint changing visible direction of text present in literal --> $DIR/unicode-control-codepoints.rs:14:22 | -LL | println!("{:?}", r##"/* } if isAdmin begin admins only "##); - | ^^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^ - | | | | || - | | | | |'\u{2066}' - | | | | '\u{2069}' - | | | '\u{2066}' +LL | println!("{:?}", r##"/*� } �if isAdmin� � begin admins only "##); + | ^^^^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^ + | | | | | | + | | | | | '\u{2066}' + | | | | '\u{2069}' + | | | '\u{2066}' | | '\u{202e}' | this literal contains invisible unicode text flow control codepoints | @@ -153,8 +153,8 @@ LL | println!("{:?}", r##"/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} b error: unicode codepoint changing visible direction of text present in literal --> $DIR/unicode-control-codepoints.rs:26:22 | -LL | println!("{:?}", ''); - | ^- +LL | println!("{:?}", '�'); + | ^-^ | || | |'\u{202e}' | this literal contains an invisible unicode text flow control codepoint @@ -169,8 +169,8 @@ LL | println!("{:?}", '\u{202e}'); error: unicode codepoint changing visible direction of text present in doc comment --> $DIR/unicode-control-codepoints.rs:33:1 | -LL | /** ''); */fn foo() {} - | ^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint +LL | /** '�'); */fn foo() {} + | ^^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint | = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on 
the screen = note: if their presence wasn't intentional, you can remove them @@ -181,8 +181,8 @@ error: unicode codepoint changing visible direction of text present in doc comme | LL | / /** LL | | * -LL | | * ''); */fn bar() {} - | |___________^ this doc comment contains an invisible unicode text flow control codepoint +LL | | * '�'); */fn bar() {} + | |____________^ this doc comment contains an invisible unicode text flow control codepoint | = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen = note: if their presence wasn't intentional, you can remove them