From 2e1a50121ef265214c5e2a7d82fe40b4928575ab Mon Sep 17 00:00:00 2001 From: Corey Richardson Date: Tue, 2 Dec 2014 16:48:48 -0800 Subject: [PATCH] syntax: support ES6-style unicode escapes First half of bootstrapping https://github.com/rust-lang/rfcs/pull/446 --- src/libsyntax/parse/lexer/mod.rs | 81 ++++++++++++++++++- src/libsyntax/parse/mod.rs | 22 +++-- .../compile-fail/new-unicode-escapes-1.rs | 13 +++ .../compile-fail/new-unicode-escapes-2.rs | 13 +++ .../compile-fail/new-unicode-escapes-3.rs | 13 +++ .../compile-fail/new-unicode-escapes-4.rs | 13 +++ src/test/run-pass/new-unicode-escapes.rs | 22 +++++ 7 files changed, 169 insertions(+), 8 deletions(-) create mode 100644 src/test/compile-fail/new-unicode-escapes-1.rs create mode 100644 src/test/compile-fail/new-unicode-escapes-2.rs create mode 100644 src/test/compile-fail/new-unicode-escapes-3.rs create mode 100644 src/test/compile-fail/new-unicode-escapes-4.rs create mode 100644 src/test/run-pass/new-unicode-escapes.rs diff --git a/src/libsyntax/parse/lexer/mod.rs b/src/libsyntax/parse/lexer/mod.rs index 57983a6dee6be..27b65e0f52798 100644 --- a/src/libsyntax/parse/lexer/mod.rs +++ b/src/libsyntax/parse/lexer/mod.rs @@ -764,6 +764,15 @@ impl<'a> StringReader<'a> { } } + // SNAP c9f6d69 + #[allow(unused)] + fn old_escape_warning(&mut self, sp: Span) { + self.span_diagnostic + .span_warn(sp, "\\U00ABCD12 and \\uABCD escapes are deprecated"); + self.span_diagnostic + .span_help(sp, "use \\u{ABCD12} escapes instead"); + } + /// Scan for a single (possibly escaped) byte or char /// in a byte, (non-raw) byte string, char, or (non-raw) string literal. /// `start` is the position of `first_source_char`, which is already consumed. 
@@ -782,12 +791,24 @@ impl<'a> StringReader<'a> { Some(e) => { return match e { 'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true, - 'x' => self.scan_hex_digits(2u, delim, !ascii_only), + 'x' => self.scan_byte_escape(delim, !ascii_only), 'u' if !ascii_only => { - self.scan_hex_digits(4u, delim, false) + if self.curr == Some('{') { + self.scan_unicode_escape(delim) + } else { + let res = self.scan_hex_digits(4u, delim, false); + // SNAP c9f6d69 + //let sp = codemap::mk_sp(escaped_pos, self.last_pos); + //self.old_escape_warning(sp); + res + } } 'U' if !ascii_only => { - self.scan_hex_digits(8u, delim, false) + let res = self.scan_hex_digits(8u, delim, false); + // SNAP c9f6d69 + //let sp = codemap::mk_sp(escaped_pos, self.last_pos); + //self.old_escape_warning(sp); + res } '\n' if delim == '"' => { self.consume_whitespace(); @@ -848,6 +869,56 @@ impl<'a> StringReader<'a> { true } + /// Scan over a \u{...} escape + /// + /// At this point, we have already seen the \ and the u, the { is the current character. We + /// will read at least one digit, and up to 6, and pass over the }. 
+ fn scan_unicode_escape(&mut self, delim: char) -> bool { + self.bump(); // past the { + let start_bpos = self.last_pos; + let mut count: uint = 0; + let mut accum_int = 0; + + while !self.curr_is('}') && count <= 6 { + let c = match self.curr { + Some(c) => c, + None => { + self.fatal_span_(start_bpos, self.last_pos, + "unterminated unicode escape (found EOF)"); + } + }; + accum_int *= 16; + accum_int += c.to_digit(16).unwrap_or_else(|| { + if c == delim { + self.fatal_span_(self.last_pos, self.pos, + "unterminated unicode escape (needed a `}`)"); + } else { + self.fatal_span_char(self.last_pos, self.pos, + "illegal character in unicode escape", c); + } + }) as u32; + self.bump(); + count += 1; + } + + if count > 6 { + self.fatal_span_(start_bpos, self.last_pos, + "overlong unicode escape (can have at most 6 hex digits)"); + } + + self.bump(); // past the ending } + + let mut valid = count >= 1 && count <= 6; + if char::from_u32(accum_int).is_none() { + valid = false; + } + + if !valid { + self.fatal_span_(start_bpos, self.last_pos, "illegal unicode character escape"); + } + valid + } + /// Scan over a float exponent. 
fn scan_float_exponent(&mut self) { if self.curr_is('e') || self.curr_is('E') { @@ -1273,6 +1344,10 @@ impl<'a> StringReader<'a> { return token::Byte(id); } + fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool { + self.scan_hex_digits(2, delim, below_0x7f_only) + } + fn scan_byte_string(&mut self) -> token::Lit { self.bump(); let start = self.last_pos; diff --git a/src/libsyntax/parse/mod.rs b/src/libsyntax/parse/mod.rs index b46f7cdfe22ad..8d0c2de048a56 100644 --- a/src/libsyntax/parse/mod.rs +++ b/src/libsyntax/parse/mod.rs @@ -393,16 +393,28 @@ pub fn char_lit(lit: &str) -> (char, int) { let msg = format!("lexer should have rejected a bad character escape {}", lit); let msg2 = msg.as_slice(); - let esc: |uint| -> Option<(char, int)> = |len| + fn esc(len: uint, lit: &str) -> Option<(char, int)> { num::from_str_radix(lit.slice(2, len), 16) .and_then(char::from_u32) - .map(|x| (x, len as int)); + .map(|x| (x, len as int)) + } + + let unicode_escape: || -> Option<(char, int)> = || + if lit.as_bytes()[2] == b'{' { + let idx = lit.find('}').expect(msg2); + let subslice = lit.slice(3, idx); + num::from_str_radix(subslice, 16) + .and_then(char::from_u32) + .map(|x| (x, subslice.char_len() as int + 4)) + } else { + esc(6, lit) + }; // Unicode escapes return match lit.as_bytes()[1] as char { - 'x' | 'X' => esc(4), - 'u' => esc(6), - 'U' => esc(10), + 'x' | 'X' => esc(4, lit), + 'u' => unicode_escape(), + 'U' => esc(10, lit), _ => None, }.expect(msg2); } diff --git a/src/test/compile-fail/new-unicode-escapes-1.rs b/src/test/compile-fail/new-unicode-escapes-1.rs new file mode 100644 index 0000000000000..f2422830a21cc --- /dev/null +++ b/src/test/compile-fail/new-unicode-escapes-1.rs @@ -0,0 +1,13 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. 
+// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +pub fn main() { + let s = "\u{2603"; //~ ERROR unterminated unicode escape (needed a `}`) +} diff --git a/src/test/compile-fail/new-unicode-escapes-2.rs b/src/test/compile-fail/new-unicode-escapes-2.rs new file mode 100644 index 0000000000000..5da8674c37ea5 --- /dev/null +++ b/src/test/compile-fail/new-unicode-escapes-2.rs @@ -0,0 +1,13 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +pub fn main() { + let s = "\u{260311111111}"; //~ ERROR overlong unicode escape (can have at most 6 hex digits) +} diff --git a/src/test/compile-fail/new-unicode-escapes-3.rs b/src/test/compile-fail/new-unicode-escapes-3.rs new file mode 100644 index 0000000000000..7c64d02efd746 --- /dev/null +++ b/src/test/compile-fail/new-unicode-escapes-3.rs @@ -0,0 +1,13 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +pub fn main() { + let s = "\u{d805}"; //~ ERROR illegal unicode character escape +} diff --git a/src/test/compile-fail/new-unicode-escapes-4.rs b/src/test/compile-fail/new-unicode-escapes-4.rs new file mode 100644 index 0000000000000..ffc2b11e0c13c --- /dev/null +++ b/src/test/compile-fail/new-unicode-escapes-4.rs @@ -0,0 +1,13 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +pub fn main() { + let s = "\u{lol}"; //~ ERROR illegal character in unicode escape +} diff --git a/src/test/run-pass/new-unicode-escapes.rs b/src/test/run-pass/new-unicode-escapes.rs new file mode 100644 index 0000000000000..2888389bcceab --- /dev/null +++ b/src/test/run-pass/new-unicode-escapes.rs @@ -0,0 +1,22 @@ +// Copyright 2014 The Rust Project Developers. See the COPYRIGHT +// file at the top-level directory of this distribution and at +// http://rust-lang.org/COPYRIGHT. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +pub fn main() { + let s = "\u{2603}"; + assert_eq!(s, "☃"); + + let s = "\u{2a10}\u{2A01}\u{2Aa0}"; + assert_eq!(s, "⨐⨁⪠"); + + let s = "\\{20}"; + let mut correct_s = String::from_str("\\"); + correct_s.push_str("{20}"); + assert_eq!(s, correct_s.as_slice()); +}