Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More unescaping cleanups #118897

Merged
merged 5 commits into from
Dec 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 30 additions & 44 deletions compiler/rustc_ast/src/util/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ impl LitKind {
// new symbol because the string in the LitKind is different to the
// string in the token.
let s = symbol.as_str();
// Vanilla strings are so common we optimize for the common case where no chars
// requiring special behaviour are present.
let symbol = if s.contains(['\\', '\r']) {
let mut buf = String::with_capacity(s.len());
let mut error = Ok(());
Expand Down Expand Up @@ -104,27 +106,20 @@ impl LitKind {
LitKind::Str(symbol, ast::StrStyle::Cooked)
}
token::StrRaw(n) => {
// Ditto.
let s = symbol.as_str();
let symbol =
if s.contains('\r') {
let mut buf = String::with_capacity(s.len());
let mut error = Ok(());
unescape_literal(s, Mode::RawStr, &mut |_, unescaped_char| {
match unescaped_char {
Ok(c) => buf.push(c),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
}
// Raw strings have no escapes, so we only need to check for invalid chars, and we
// can reuse the symbol on success.
let mut error = Ok(());
unescape_literal(symbol.as_str(), Mode::RawStr, &mut |_, unescaped_char| {
match unescaped_char {
Ok(_) => {}
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
});
error?;
Symbol::intern(&buf)
} else {
symbol
};
}
}
});
error?;
LitKind::Str(symbol, ast::StrStyle::Raw(n))
}
token::ByteStr => {
Expand All @@ -143,25 +138,19 @@ impl LitKind {
LitKind::ByteStr(buf.into(), StrStyle::Cooked)
}
token::ByteStrRaw(n) => {
// Raw strings have no escapes, so we only need to check for invalid chars, and we
// can convert the symbol directly to a `Lrc<u8>` on success.
let s = symbol.as_str();
let bytes = if s.contains('\r') {
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
Ok(c) => buf.push(byte_from_char(c)),
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
let mut error = Ok(());
unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
Ok(_) => {}
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
});
error?;
buf
} else {
symbol.to_string().into_bytes()
};

LitKind::ByteStr(bytes.into(), StrStyle::Raw(n))
}
});
LitKind::ByteStr(s.to_owned().into_bytes().into(), StrStyle::Raw(n))
}
token::CStr => {
let s = symbol.as_str();
Expand All @@ -172,7 +161,6 @@ impl LitKind {
error = Err(LitError::NulInCStr(span));
}
Ok(CStrUnit::Byte(b)) => buf.push(b),
Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
Ok(CStrUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
}
Expand All @@ -187,25 +175,23 @@ impl LitKind {
LitKind::CStr(buf.into(), StrStyle::Cooked)
}
token::CStrRaw(n) => {
// Raw strings have no escapes, so we only need to check for invalid chars, and we
// can convert the symbol directly to a `Lrc<u8>` on success.
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
let mut error = Ok(());
unescape_c_string(s, Mode::RawCStr, &mut |span, c| match c {
Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => {
error = Err(LitError::NulInCStr(span));
}
Ok(CStrUnit::Byte(b)) => buf.push(b),
Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
Ok(CStrUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
}
Ok(_) => {}
Err(err) => {
if err.is_fatal() {
error = Err(LitError::LexerError);
}
}
});
error?;
let mut buf = s.to_owned().into_bytes();
buf.push(0);
LitKind::CStr(buf.into(), StrStyle::Raw(n))
}
Expand Down
21 changes: 10 additions & 11 deletions compiler/rustc_lexer/src/unescape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ where
let res = unescape_char_or_byte(&mut chars, mode);
callback(0..(src.len() - chars.as_str().len()), res);
}
Str | ByteStr => unescape_str_common(src, mode, callback),
RawStr | RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback),
Str | ByteStr => unescape_non_raw_common(src, mode, callback),
RawStr | RawByteStr => check_raw_common(src, mode, callback),
CStr | RawCStr => unreachable!(),
}
}
Expand Down Expand Up @@ -122,12 +122,10 @@ where
{
match mode {
CStr => {
unescape_str_common(src, mode, callback);
unescape_non_raw_common(src, mode, callback);
}
RawCStr => {
unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| {
callback(r, result.map(CStrUnit::Char))
});
check_raw_common(src, mode, &mut |r, result| callback(r, result.map(CStrUnit::Char)));
}
Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
}
Expand Down Expand Up @@ -191,8 +189,9 @@ impl Mode {
/// Byte literals do not allow unicode escape.
fn is_unicode_escape_disallowed(self) -> bool {
match self {
Byte | ByteStr | RawByteStr => true,
Char | Str | RawStr | CStr | RawCStr => false,
Byte | ByteStr => true,
Char | Str | CStr => false,
RawByteStr | RawStr | RawCStr => unreachable!(),
}
}

Expand Down Expand Up @@ -324,7 +323,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca

/// Takes a contents of a string literal (without quotes) and produces a
/// sequence of escaped characters or errors.
fn unescape_str_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
fn unescape_non_raw_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
where
F: FnMut(Range<usize>, Result<T, EscapeError>),
{
Expand Down Expand Up @@ -391,15 +390,15 @@ where
/// sequence of characters or errors.
/// NOTE: Raw strings do not perform any explicit character escaping, here we
/// only produce errors on bare CR.
fn unescape_raw_str_or_raw_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
let mut chars = src.chars();
let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop

// The `start` and `end` computation here matches the one in
// `unescape_str_common` for consistency, even though this function
// `unescape_non_raw_common` for consistency, even though this function
// doesn't have to worry about skipping any chars.
while let Some(c) = chars.next() {
let start = src.len() - chars.as_str().len() - c.len_utf8();
Expand Down
75 changes: 42 additions & 33 deletions compiler/rustc_parse/src/lexer/unescape_error_reporting.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,32 +11,34 @@ use crate::errors::{MoreThanOneCharNote, MoreThanOneCharSugg, NoBraceUnicodeSub,

pub(crate) fn emit_unescape_error(
handler: &Handler,
// interior part of the literal, without quotes
// interior part of the literal, between quotes
lit: &str,
// full span of the literal, including quotes
span_with_quotes: Span,
// interior span of the literal, without quotes
span: Span,
// full span of the literal, including quotes and any prefix
full_lit_span: Span,
// span of the error part of the literal
err_span: Span,
mode: Mode,
// range of the error inside `lit`
range: Range<usize>,
error: EscapeError,
) {
debug!(
"emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}",
lit, span_with_quotes, mode, range, error
lit, full_lit_span, mode, range, error
);
let last_char = || {
let c = lit[range.clone()].chars().next_back().unwrap();
let span = span.with_lo(span.hi() - BytePos(c.len_utf8() as u32));
let span = err_span.with_lo(err_span.hi() - BytePos(c.len_utf8() as u32));
(c, span)
};
match error {
EscapeError::LoneSurrogateUnicodeEscape => {
handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: true });
handler
.emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: true });
}
EscapeError::OutOfRangeUnicodeEscape => {
handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: false });
handler
.emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: false });
}
EscapeError::MoreThanOneChar => {
use unicode_normalization::{char::is_combining_mark, UnicodeNormalization};
Expand All @@ -49,12 +51,16 @@ pub(crate) fn emit_unescape_error(
let normalized = lit.nfc().to_string();
if normalized.chars().count() == 1 {
let ch = normalized.chars().next().unwrap().escape_default().to_string();
sugg = Some(MoreThanOneCharSugg::NormalizedForm { span, ch, normalized });
sugg = Some(MoreThanOneCharSugg::NormalizedForm {
span: err_span,
ch,
normalized,
});
}
let escaped_marks =
rest.iter().map(|c| c.escape_default().to_string()).collect::<Vec<_>>();
note = Some(MoreThanOneCharNote::AllCombining {
span,
span: err_span,
chr: format!("{first}"),
len: escaped_marks.len(),
escaped_marks: escaped_marks.join(""),
Expand All @@ -69,10 +75,12 @@ pub(crate) fn emit_unescape_error(
.collect();

if let &[ch] = printable.as_slice() {
sugg =
Some(MoreThanOneCharSugg::RemoveNonPrinting { span, ch: ch.to_string() });
sugg = Some(MoreThanOneCharSugg::RemoveNonPrinting {
span: err_span,
ch: ch.to_string(),
});
note = Some(MoreThanOneCharNote::NonPrinting {
span,
span: err_span,
escaped: lit.escape_default().to_string(),
});
}
Expand All @@ -91,21 +99,21 @@ pub(crate) fn emit_unescape_error(
}
let sugg = format!("{prefix}\"{escaped}\"");
MoreThanOneCharSugg::Quotes {
span: span_with_quotes,
span: full_lit_span,
is_byte: mode == Mode::Byte,
sugg,
}
});
handler.emit_err(UnescapeError::MoreThanOneChar {
span: span_with_quotes,
span: full_lit_span,
note,
suggestion: sugg,
});
}
EscapeError::EscapeOnlyChar => {
let (c, char_span) = last_char();
handler.emit_err(UnescapeError::EscapeOnlyChar {
span,
span: err_span,
char_span,
escaped_sugg: c.escape_default().to_string(),
escaped_msg: escaped_char(c),
Expand All @@ -114,11 +122,11 @@ pub(crate) fn emit_unescape_error(
}
EscapeError::BareCarriageReturn => {
let double_quotes = mode.in_double_quotes();
handler.emit_err(UnescapeError::BareCr { span, double_quotes });
handler.emit_err(UnescapeError::BareCr { span: err_span, double_quotes });
}
EscapeError::BareCarriageReturnInRawString => {
assert!(mode.in_double_quotes());
handler.emit_err(UnescapeError::BareCrRawString(span));
handler.emit_err(UnescapeError::BareCrRawString(err_span));
}
EscapeError::InvalidEscape => {
let (c, span) = last_char();
Expand All @@ -143,7 +151,7 @@ pub(crate) fn emit_unescape_error(
} else {
if mode == Mode::Str || mode == Mode::Char {
diag.span_suggestion(
span_with_quotes,
full_lit_span,
"if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal",
format!("r\"{lit}\""),
Applicability::MaybeIncorrect,
Expand All @@ -158,7 +166,7 @@ pub(crate) fn emit_unescape_error(
diag.emit();
}
EscapeError::TooShortHexEscape => {
handler.emit_err(UnescapeError::TooShortHexEscape(span));
handler.emit_err(UnescapeError::TooShortHexEscape(err_span));
}
EscapeError::InvalidCharInHexEscape | EscapeError::InvalidCharInUnicodeEscape => {
let (c, span) = last_char();
Expand Down Expand Up @@ -210,7 +218,7 @@ pub(crate) fn emit_unescape_error(
err.emit();
}
EscapeError::OutOfRangeHexEscape => {
handler.emit_err(UnescapeError::OutOfRangeHexEscape(span));
handler.emit_err(UnescapeError::OutOfRangeHexEscape(err_span));
}
EscapeError::LeadingUnderscoreUnicodeEscape => {
let (c, span) = last_char();
Expand All @@ -220,10 +228,11 @@ pub(crate) fn emit_unescape_error(
});
}
EscapeError::OverlongUnicodeEscape => {
handler.emit_err(UnescapeError::OverlongUnicodeEscape(span));
handler.emit_err(UnescapeError::OverlongUnicodeEscape(err_span));
}
EscapeError::UnclosedUnicodeEscape => {
handler.emit_err(UnescapeError::UnclosedUnicodeEscape(span, span.shrink_to_hi()));
handler
.emit_err(UnescapeError::UnclosedUnicodeEscape(err_span, err_span.shrink_to_hi()));
}
EscapeError::NoBraceInUnicodeEscape => {
let mut suggestion = "\\u{".to_owned();
Expand All @@ -238,34 +247,34 @@ pub(crate) fn emit_unescape_error(
let (label, sub) = if suggestion_len > 0 {
suggestion.push('}');
let hi = char_span.lo() + BytePos(suggestion_len as u32);
(None, NoBraceUnicodeSub::Suggestion { span: span.with_hi(hi), suggestion })
(None, NoBraceUnicodeSub::Suggestion { span: err_span.with_hi(hi), suggestion })
} else {
(Some(span), NoBraceUnicodeSub::Help)
(Some(err_span), NoBraceUnicodeSub::Help)
};
handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span, label, sub });
handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span: err_span, label, sub });
}
EscapeError::UnicodeEscapeInByte => {
handler.emit_err(UnescapeError::UnicodeEscapeInByte(span));
handler.emit_err(UnescapeError::UnicodeEscapeInByte(err_span));
}
EscapeError::EmptyUnicodeEscape => {
handler.emit_err(UnescapeError::EmptyUnicodeEscape(span));
handler.emit_err(UnescapeError::EmptyUnicodeEscape(err_span));
}
EscapeError::ZeroChars => {
handler.emit_err(UnescapeError::ZeroChars(span));
handler.emit_err(UnescapeError::ZeroChars(err_span));
}
EscapeError::LoneSlash => {
handler.emit_err(UnescapeError::LoneSlash(span));
handler.emit_err(UnescapeError::LoneSlash(err_span));
}
EscapeError::UnskippedWhitespaceWarning => {
let (c, char_span) = last_char();
handler.emit_warning(UnescapeError::UnskippedWhitespace {
span,
span: err_span,
ch: escaped_char(c),
char_span,
});
}
EscapeError::MultipleSkippedLinesWarning => {
handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(span));
handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(err_span));
}
}
}
Expand Down
Loading