rust-lang · bors · Aug 25, 2016 · Aug 23, 2016 · Aug 23, 2016 · Aug 23, 2016
diff --git a/src/libcore/char.rs b/src/libcore/char.rs
@@ -752,25 +752,81 @@ pub struct InvalidSequence(());
 impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
     type Item = Result<char, InvalidSequence>;
     #[inline]
+
     fn next(&mut self) -> Option<Result<char, InvalidSequence>> {
-        self.0.next().map(|b| {
-            if b & 0x80 == 0 { Ok(b as char) } else {
-                let l = (!b).leading_zeros() as usize; // number of bytes in UTF-8 representation
-                if l < 2 || l > 6 { return Err(InvalidSequence(())) };
-                let mut x = (b as u32) & (0x7F >> l);
-                for _ in 0..l-1 {
+        self.0.next().map(|first_byte| {
+            // Emit InvalidSequence according to
+            // Unicode §5.22 Best Practice for U+FFFD Substitution
+            // http://www.unicode.org/versions/Unicode9.0.0/ch05.pdf#G40630
+
+            // Roughly: consume at least one byte,
+            // then validate one byte at a time and stop before the first unexpected byte
+            // (which might be the valid start of the next byte sequence).
+
+            let mut code_point;
+            macro_rules! first_byte {
+                ($mask: expr) => {
+                    code_point = u32::from(first_byte & $mask)
+                }
+            }
+            macro_rules! continuation_byte {
+                () => { continuation_byte!(0x80...0xBF) };
+                ($range: pat) => {
                     match self.0.peek() {
-                        Some(&b) if b & 0xC0 == 0x80 => {
+                        Some(&byte @ $range) => {
+                            code_point = (code_point << 6) | u32::from(byte & 0b0011_1111);
                             self.0.next();
-                            x = (x << 6) | (b as u32) & 0x3F;
-                        },
-                        _ => return Err(InvalidSequence(())),
+                        }
+                        _ => return Err(InvalidSequence(()))
                     }
                 }
-                match from_u32(x) {
-                    Some(x) if l == x.len_utf8() => Ok(x),
-                    _ => Err(InvalidSequence(())),
+            }
+
+            match first_byte {
+                0x00...0x7F => {
+                    first_byte!(0b1111_1111);
+                }
+                0xC2...0xDF => {
+                    first_byte!(0b0001_1111);
+                    continuation_byte!();
+                }
+                0xE0 => {
+                    first_byte!(0b0000_1111);
+                    continuation_byte!(0xA0...0xBF);  // 0x80...0x9F here are overlong
+                    continuation_byte!();
                 }
+                0xE1...0xEC | 0xEE...0xEF => {
+                    first_byte!(0b0000_1111);
+                    continuation_byte!();
+                    continuation_byte!();
+                }
+                0xED => {
+                    first_byte!(0b0000_1111);
+                    continuation_byte!(0x80...0x9F);  // 0xA0...0xBF here are surrogates
+                    continuation_byte!();
+                }
+                0xF0 => {
+                    first_byte!(0b0000_0111);
+                    continuation_byte!(0x90...0xBF);  // 0x80...0x8F here are overlong
+                    continuation_byte!();
+                    continuation_byte!();
+                }
+                0xF1...0xF3 => {
+                    first_byte!(0b0000_0111);
+                    continuation_byte!();
+                    continuation_byte!();
+                    continuation_byte!();
+                }
+                0xF4 => {
+                    first_byte!(0b0000_0111);
+                    continuation_byte!(0x80...0x8F);  // 0x90...0xBF here are beyond char::MAX
+                    continuation_byte!();
+                    continuation_byte!();
+                }
+                _ => return Err(InvalidSequence(()))  // Illegal first byte, overlong, or beyond MAX
+            }
+            unsafe {
+                Ok(from_u32_unchecked(code_point))
             }
         })
     }

diff --git a/src/libcoretest/char.rs b/src/libcoretest/char.rs
@@ -358,29 +358,50 @@ fn eu_iterator_specializations() {
 
 #[test]
 fn test_decode_utf8() {
-    use core::char::*;
-    use core::iter::FromIterator;
-
-    for &(str, bs) in [("", &[] as &[u8]),
-                       ("A", &[0x41u8] as &[u8]),
-                       ("�", &[0xC1u8, 0x81u8] as &[u8]),
-                       ("♥", &[0xE2u8, 0x99u8, 0xA5u8]),
-                       ("♥A", &[0xE2u8, 0x99u8, 0xA5u8, 0x41u8] as &[u8]),
-                       ("�", &[0xE2u8, 0x99u8] as &[u8]),
-                       ("�A", &[0xE2u8, 0x99u8, 0x41u8] as &[u8]),
-                       ("�", &[0xC0u8] as &[u8]),
-                       ("�A", &[0xC0u8, 0x41u8] as &[u8]),
-                       ("�", &[0x80u8] as &[u8]),
-                       ("�A", &[0x80u8, 0x41u8] as &[u8]),
-                       ("�", &[0xFEu8] as &[u8]),
-                       ("�A", &[0xFEu8, 0x41u8] as &[u8]),
-                       ("�", &[0xFFu8] as &[u8]),
-                       ("�A", &[0xFFu8, 0x41u8] as &[u8])].into_iter() {
-        assert!(Iterator::eq(str.chars(),
-                             decode_utf8(bs.into_iter().map(|&b|b))
-                                 .map(|r_b| r_b.unwrap_or('\u{FFFD}'))),
-                "chars = {}, bytes = {:?}, decoded = {:?}", str, bs,
-                Vec::from_iter(decode_utf8(bs.into_iter().map(|&b|b))
-                                   .map(|r_b| r_b.unwrap_or('\u{FFFD}'))));
+    macro_rules! assert_decode_utf8 {
+        ($input_bytes: expr, $expected_str: expr) => {
+            let input_bytes: &[u8] = &$input_bytes;
+            let s = char::decode_utf8(input_bytes.iter().cloned())
+                .map(|r_b| r_b.unwrap_or('\u{FFFD}'))
+                .collect::<String>();
+            assert_eq!(s, $expected_str,
+                       "input bytes: {:?}, expected str: {:?}, result: {:?}",
+                       input_bytes, $expected_str, s);
+            assert_eq!(String::from_utf8_lossy(&$input_bytes), $expected_str);
+        }
     }
+
+    assert_decode_utf8!([], "");
+    assert_decode_utf8!([0x41], "A");
+    assert_decode_utf8!([0xC1, 0x81], "��");
+    assert_decode_utf8!([0xE2, 0x99, 0xA5], "♥");
+    assert_decode_utf8!([0xE2, 0x99, 0xA5, 0x41], "♥A");
+    assert_decode_utf8!([0xE2, 0x99], "�");
+    assert_decode_utf8!([0xE2, 0x99, 0x41], "�A");
+    assert_decode_utf8!([0xC0], "�");
+    assert_decode_utf8!([0xC0, 0x41], "�A");
+    assert_decode_utf8!([0x80], "�");
+    assert_decode_utf8!([0x80, 0x41], "�A");
+    assert_decode_utf8!([0xFE], "�");
+    assert_decode_utf8!([0xFE, 0x41], "�A");
+    assert_decode_utf8!([0xFF], "�");
+    assert_decode_utf8!([0xFF, 0x41], "�A");
+    assert_decode_utf8!([0xC0, 0x80], "��");
+
+    // Surrogates
+    assert_decode_utf8!([0xED, 0x9F, 0xBF], "\u{D7FF}");
+    assert_decode_utf8!([0xED, 0xA0, 0x80], "���");
+    assert_decode_utf8!([0xED, 0xBF, 0x80], "���");
+    assert_decode_utf8!([0xEE, 0x80, 0x80], "\u{E000}");
+
+    // char::MAX
+    assert_decode_utf8!([0xF4, 0x8F, 0xBF, 0xBF], "\u{10FFFF}");
+    assert_decode_utf8!([0xF4, 0x8F, 0xBF, 0x41], "�A");
+    assert_decode_utf8!([0xF4, 0x90, 0x80, 0x80], "����");
+
+    // 5- and 6-byte sequences
+    // Part of the original design of UTF-8,
+    // but invalid now that UTF-8 is artificially restricted to match the range of UTF-16.
+    assert_decode_utf8!([0xF8, 0x80, 0x80, 0x80, 0x80], "�����");
+    assert_decode_utf8!([0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], "������");
 }
diff --git a/src/librbml/lib.rs b/src/librbml/lib.rs
@@ -173,12 +173,12 @@ impl<'doc> Doc<'doc> {
         self.start == self.end
     }
 
-    pub fn as_str_slice(&self) -> &'doc str {
+    pub fn as_str(&self) -> &'doc str {
         str::from_utf8(&self.data[self.start..self.end]).unwrap()
     }
 
-    pub fn as_str(&self) -> String {
-        self.as_str_slice().to_string()
+    pub fn to_string(&self) -> String {
+        self.as_str().to_string()
     }
 }
 
@@ -773,7 +773,7 @@ pub mod reader {
             Ok(char::from_u32(doc_as_u32(self.next_doc(EsChar)?)).unwrap())
         }
         fn read_str(&mut self) -> DecodeResult<String> {
-            Ok(self.next_doc(EsStr)?.as_str())
+            Ok(self.next_doc(EsStr)?.to_string())
         }
 
         // Compound types:

diff --git a/src/librustc/lint/context.rs b/src/librustc/lint/context.rs
@@ -601,7 +601,7 @@ pub trait LintContext: Sized {
             for (lint_id, level, span) in v {
                 let (now, now_source) = self.lints().get_level_source(lint_id);
                 if now == Forbid && level != Forbid {
-                    let lint_name = lint_id.as_str();
+                    let lint_name = lint_id.to_string();
                     let mut diag_builder = struct_span_err!(self.sess(), span, E0453,
                                                             "{}({}) overruled by outer forbid({})",
                                                             level.as_str(), lint_name,
@@ -1216,7 +1216,7 @@ pub fn check_crate<'a, 'tcx>(tcx: TyCtxt<'a, 'tcx, 'tcx>,
         for &(lint, span, ref msg) in v {
             span_bug!(span,
                       "unprocessed lint {} at {}: {}",
-                      lint.as_str(), tcx.map.node_to_string(*id), *msg)
+                      lint.to_string(), tcx.map.node_to_string(*id), *msg)
         }
     }
 
@@ -1252,7 +1252,7 @@ pub fn check_ast_crate(sess: &Session, krate: &ast::Crate) {
     // in the iteration code.
     for (_, v) in sess.lints.borrow().iter() {
         for &(lint, span, ref msg) in v {
-            span_bug!(span, "unprocessed lint {}: {}", lint.as_str(), *msg)
+            span_bug!(span, "unprocessed lint {}: {}", lint.to_string(), *msg)
         }
     }
 }
diff --git a/src/librustc/lint/mod.rs b/src/librustc/lint/mod.rs
@@ -263,7 +263,7 @@ impl LintId {
     }
 
     /// Get the name of the lint.
-    pub fn as_str(&self) -> String {
+    pub fn to_string(&self) -> String {
         self.lint.name_lower()
     }
 }

diff --git a/src/librustc/middle/region.rs b/src/librustc/middle/region.rs
@@ -237,7 +237,7 @@ impl CodeExtent {
                         // (This is the special case aluded to in the
                         // doc-comment for this method)
                         let stmt_span = blk.stmts[r.first_statement_index as usize].span;
-                        Some(Span { lo: stmt_span.hi, ..blk.span })
+                        Some(Span { lo: stmt_span.hi, hi: blk.span.hi, expn_id: stmt_span.expn_id })
                     }
                 }
             }

diff --git a/src/librustc/session/config.rs b/src/librustc/session/config.rs
@@ -891,6 +891,8 @@ options! {DebuggingOptions, DebuggingSetter, basic_debugging_options,
           "force overflow checks on or off"),
     trace_macros: bool = (false, parse_bool, [UNTRACKED],
           "for every macro invocation, print its name and arguments"),
+    debug_macros: bool = (false, parse_bool, [TRACKED],
+          "emit line numbers debug info inside macros"),
     enable_nonzeroing_move_hints: bool = (false, parse_bool, [TRACKED],
           "force nonzeroing move optimization on"),
     keep_hygiene_data: bool = (false, parse_bool, [UNTRACKED],

diff --git a/src/librustc_driver/lib.rs b/src/librustc_driver/lib.rs
@@ -861,7 +861,7 @@ Available lint options:
         for (name, to) in lints {
             let name = name.to_lowercase().replace("_", "-");
             let desc = to.into_iter()
-                         .map(|x| x.as_str().replace("_", "-"))
+                         .map(|x| x.to_string().replace("_", "-"))
                          .collect::<Vec<String>>()
                          .join(", ");
             println!("    {}  {}", padded(&name[..]), desc);

diff --git a/src/librustc_llvm/ffi.rs b/src/librustc_llvm/ffi.rs
@@ -1796,6 +1796,11 @@ extern {
                                                Col: c_uint)
                                                -> DILexicalBlock;
 
+    pub fn LLVMRustDIBuilderCreateLexicalBlockFile(Builder: DIBuilderRef,
+                                                   Scope: DIScope,
+                                                   File: DIFile)
+                                                   -> DILexicalBlock;
+
     pub fn LLVMRustDIBuilderCreateStaticVariable(Builder: DIBuilderRef,
                                                  Context: DIScope,
                                                  Name: *const c_char,