diff --git a/src/libcore/char.rs b/src/libcore/char.rs
index 98b7632a220dd..f7d4993bee3b9 100644
--- a/src/libcore/char.rs
+++ b/src/libcore/char.rs
@@ -752,25 +752,87 @@ pub struct InvalidSequence(());
 impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
     type Item = Result<char, InvalidSequence>;
     #[inline]
     fn next(&mut self) -> Option<Result<char, InvalidSequence>> {
-        self.0.next().map(|b| {
-            if b & 0x80 == 0 { Ok(b as char) } else {
-                let l = (!b).leading_zeros() as usize; // number of bytes in UTF-8 representation
-                if l < 2 || l > 6 { return Err(InvalidSequence(())) };
-                let mut x = (b as u32) & (0x7F >> l);
-                for _ in 0..l-1 {
+        self.0.next().map(|first_byte| {
+            // Emit InvalidSequence according to
+            // Unicode §5.22 Best Practice for U+FFFD Substitution
+            // http://www.unicode.org/versions/Unicode9.0.0/ch05.pdf#G40630
+
+            // Roughly: consume at least one byte,
+            // then validate one byte at a time and stop before the first unexpected byte
+            // (which might be the valid start of the next byte sequence).
+
+            let mut code_point;
+            // Starts `code_point` from the bits of the first byte selected by `$mask`.
+            macro_rules! first_byte {
+                ($mask: expr) => {
+                    code_point = u32::from(first_byte & $mask)
+                }
+            }
+            // Peeks at the next byte: if it matches `$range` it is consumed and its
+            // low six bits are appended to `code_point`; otherwise it is left unread
+            // and the closure returns `Err(InvalidSequence(()))`.
+            macro_rules! continuation_byte {
+                () => { continuation_byte!(0x80...0xBF) };
+                ($range: pat) => {
                     match self.0.peek() {
-                        Some(&b) if b & 0xC0 == 0x80 => {
+                        Some(&byte @ $range) => {
+                            code_point = (code_point << 6) | u32::from(byte & 0b0011_1111);
                             self.0.next();
-                            x = (x << 6) | (b as u32) & 0x3F;
-                        },
-                        _ => return Err(InvalidSequence(())),
+                        }
+                        _ => return Err(InvalidSequence(()))
                     }
                 }
-                match from_u32(x) {
-                    Some(x) if l == x.len_utf8() => Ok(x),
-                    _ => Err(InvalidSequence(())),
+            }
+
+            match first_byte {
+                0x00...0x7F => {
+                    first_byte!(0b1111_1111);
+                }
+                0xC2...0xDF => {
+                    first_byte!(0b0001_1111);
+                    continuation_byte!();
+                }
+                0xE0 => {
+                    first_byte!(0b0000_1111);
+                    continuation_byte!(0xA0...0xBF);  // 0x80...0x9F here are overlong
+                    continuation_byte!();
                 }
+                0xE1...0xEC | 0xEE...0xEF => {
+                    first_byte!(0b0000_1111);
+                    continuation_byte!();
+                    continuation_byte!();
+                }
+                0xED => {
+                    first_byte!(0b0000_1111);
+                    continuation_byte!(0x80...0x9F);  // 0xA0...0xBF here are surrogates
+                    continuation_byte!();
+                }
+                0xF0 => {
+                    first_byte!(0b0000_0111);
+                    continuation_byte!(0x90...0xBF);  // 0x80...0x8F here are overlong
+                    continuation_byte!();
+                    continuation_byte!();
+                }
+                0xF1...0xF3 => {
+                    first_byte!(0b0000_0111);
+                    continuation_byte!();
+                    continuation_byte!();
+                    continuation_byte!();
+                }
+                0xF4 => {
+                    first_byte!(0b0000_0111);
+                    continuation_byte!(0x80...0x8F);  // 0x90...0xBF here are beyond char::MAX
+                    continuation_byte!();
+                    continuation_byte!();
+                }
+                _ => return Err(InvalidSequence(()))  // Illegal first byte, overlong, or beyond MAX
+            }
+            // SAFETY: every arm that reaches this point restricted the first byte and
+            // the first continuation byte to ranges that exclude overlong forms,
+            // surrogates (0xD800...0xDFFF) and values above 0x10FFFF.
+            unsafe {
+                Ok(from_u32_unchecked(code_point))
             }
         })
     }
diff --git a/src/libcoretest/char.rs b/src/libcoretest/char.rs
index 4632419336d7f..333503d738943 100644
--- a/src/libcoretest/char.rs
+++ b/src/libcoretest/char.rs
@@ -358,29 +358,52 @@ fn eu_iterator_specializations() {
 
 #[test]
 fn test_decode_utf8() {
-    use core::char::*;
-    use core::iter::FromIterator;
-
-    for &(str, bs) in [("", &[] as &[u8]),
-                       ("A", &[0x41u8] as &[u8]),
-                       ("�", &[0xC1u8, 0x81u8] as &[u8]),
-                       ("♥", &[0xE2u8, 0x99u8, 0xA5u8]),
-                       ("♥A", &[0xE2u8, 0x99u8, 0xA5u8, 0x41u8] as &[u8]),
-                       ("�", &[0xE2u8, 0x99u8] as &[u8]),
-                       ("�A", &[0xE2u8, 0x99u8, 0x41u8] as &[u8]),
-                       ("�", &[0xC0u8] as &[u8]),
-                       ("�A", &[0xC0u8, 0x41u8] as &[u8]),
-                       ("�", &[0x80u8] as &[u8]),
-                       ("�A", &[0x80u8, 0x41u8] as &[u8]),
-                       ("�", &[0xFEu8] as &[u8]),
-                       ("�A", &[0xFEu8, 0x41u8] as &[u8]),
-                       ("�", &[0xFFu8] as &[u8]),
-                       ("�A", &[0xFFu8, 0x41u8] as &[u8])].into_iter() {
-        assert!(Iterator::eq(str.chars(),
-                             decode_utf8(bs.into_iter().map(|&b|b))
-                                 .map(|r_b| r_b.unwrap_or('\u{FFFD}'))),
-                "chars = {}, bytes = {:?}, decoded = {:?}", str, bs,
-                Vec::from_iter(decode_utf8(bs.into_iter().map(|&b|b))
-                                   .map(|r_b| r_b.unwrap_or('\u{FFFD}'))));
+    // Decodes `$input_bytes` with every `Err` mapped to U+FFFD and checks that both
+    // the result and `String::from_utf8_lossy` of the same bytes equal `$expected_str`.
+    macro_rules! assert_decode_utf8 {
+        ($input_bytes: expr, $expected_str: expr) => {
+            let input_bytes: &[u8] = &$input_bytes;
+            let s = char::decode_utf8(input_bytes.iter().cloned())
+                .map(|r_b| r_b.unwrap_or('\u{FFFD}'))
+                .collect::<String>();
+            assert_eq!(s, $expected_str,
+                       "input bytes: {:?}, expected str: {:?}, result: {:?}",
+                       input_bytes, $expected_str, s);
+            assert_eq!(String::from_utf8_lossy(input_bytes), $expected_str);
+        }
     }
+
+    assert_decode_utf8!([], "");
+    assert_decode_utf8!([0x41], "A");
+    assert_decode_utf8!([0xC1, 0x81], "��");
+    assert_decode_utf8!([0xE2, 0x99, 0xA5], "♥");
+    assert_decode_utf8!([0xE2, 0x99, 0xA5, 0x41], "♥A");
+    assert_decode_utf8!([0xE2, 0x99], "�");
+    assert_decode_utf8!([0xE2, 0x99, 0x41], "�A");
+    assert_decode_utf8!([0xC0], "�");
+    assert_decode_utf8!([0xC0, 0x41], "�A");
+    assert_decode_utf8!([0x80], "�");
+    assert_decode_utf8!([0x80, 0x41], "�A");
+    assert_decode_utf8!([0xFE], "�");
+    assert_decode_utf8!([0xFE, 0x41], "�A");
+    assert_decode_utf8!([0xFF], "�");
+    assert_decode_utf8!([0xFF, 0x41], "�A");
+    assert_decode_utf8!([0xC0, 0x80], "��");
+
+    // Surrogates
+    assert_decode_utf8!([0xED, 0x9F, 0xBF], "\u{D7FF}");
+    assert_decode_utf8!([0xED, 0xA0, 0x80], "���");
+    assert_decode_utf8!([0xED, 0xBF, 0x80], "���");
+    assert_decode_utf8!([0xEE, 0x80, 0x80], "\u{E000}");
+
+    // char::MAX
+    assert_decode_utf8!([0xF4, 0x8F, 0xBF, 0xBF], "\u{10FFFF}");
+    assert_decode_utf8!([0xF4, 0x8F, 0xBF, 0x41], "�A");
+    assert_decode_utf8!([0xF4, 0x90, 0x80, 0x80], "����");
+
+    // 5 and 6 bytes sequence
+    // Part of the original design of UTF-8,
+    // but invalid now that UTF-8 is artificially restricted to match the range of UTF-16.
+    assert_decode_utf8!([0xF8, 0x80, 0x80, 0x80, 0x80], "�����");
+    assert_decode_utf8!([0xFC, 0x80, 0x80, 0x80, 0x80, 0x80], "������");
 }