From ce28e9ba01cda818439b36108e072435715552fc Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Thu, 9 Jan 2025 15:27:21 +1100 Subject: [PATCH] Make InvalidCharError support UTF-8 Currently the `InvalidCharError` error returns a `u8` if an invalid hex character is found. If that invalid character happened to be a `char` that encodes as UTF-8 using more than one byte (i.e. not ASCII) then this means the `u8` returned is not the same as the invalid character in the input string. Add support for invalid hex characters that fall outside of a single ASCII byte. Because there are so many edge cases, this is a best effort. I'm not claiming to have gotten this right for all input strings. We default to just returning the byte. Currently a 4-byte encoded character that falls on an odd position in the hex string is not correctly handled but this patch is definitely an improvement none the less. --- src/error.rs | 177 +++++++++++++++++++++++++++++++++++++++++++++++++-- src/iter.rs | 121 ++++++++++++++++++++++------------- src/parse.rs | 133 ++++++++++++++++++++++++++++++++++---- 3 files changed, 370 insertions(+), 61 deletions(-) diff --git a/src/error.rs b/src/error.rs index 8ff1f5d..e183988 100644 --- a/src/error.rs +++ b/src/error.rs @@ -97,29 +97,166 @@ impl From for ToBytesError { fn from(e: OddLengthStringError) -> Self { Self::OddLengthString(e) } } -/// Invalid hex character. +/// UTF-8 character is not an ASCII hex character. #[derive(Debug, Clone, PartialEq, Eq)] pub struct InvalidCharError { - pub(crate) invalid: u8, + pub(crate) invalid: char, pub(crate) pos: usize, } impl InvalidCharError { - /// Returns the invalid character byte. - pub fn invalid_char(&self) -> u8 { self.invalid } - /// Returns the position of the invalid character byte. + /// Returns the invalid character. + pub fn invalid_char(&self) -> char { self.invalid } + + /// Returns the position in the input string of the invalid character. pub fn pos(&self) -> usize { self.pos } } impl fmt::Display for InvalidCharError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "invalid hex char {} at pos {}", self.invalid_char(), self.pos()) + write!( + f, + "char {} at pos {} is not a valid ASCII hex character", + self.invalid_char(), + self.pos() + ) } } #[cfg(feature = "std")] impl std::error::Error for InvalidCharError {} +/// Invalid ASCII encoding of a hex digit. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct InvalidDigitError { + /// The high digit in the pair e.g., for 'ab' this is 'a'. + pub(crate) hi: u8, + + /// The high digit in the pair e.g., for 'ab' this is 'b'. + pub(crate) lo: u8, + + /// `true` if the invalid byte is the `hi` one, `false` if its the `lo` one. + pub(crate) is_hi: bool, + + /// Position of the stat of the byte pair in the input string. + pub(crate) pos: usize, +} + +impl InvalidDigitError { + /// Returns the invalid byte. + pub fn invalid_byte(&self) -> u8 { + if self.is_hi { + self.hi + } else { + self.lo + } + } + + /// Returns the character position in input string of the invalid byte. + pub fn pos(&self) -> usize { + if self.is_hi { + self.pos + } else { + self.pos + 1 + } + } + + pub(crate) fn into_invalid_char_error( + self, + // The next item from the iterator. + next: Option>, + ) -> InvalidCharError { + // Try to map the UTF-8 values in `v` to an invalid character. + fn error(v: &[u8], pos: usize) -> Option { + core::str::from_utf8(v) + .map(|s| InvalidCharError { invalid: s.chars().next().unwrap(), pos }) + .ok() + } + + let rich_error = match next { + Some(res) => match res { + // The next two bytes happen to be valid ASCII. + Ok(byte) => { + let (hi, lo) = crate::iter::byte_to_hex_digits(byte); + if !self.is_hi { + let vals = [self.lo]; + error(&vals, self.pos + 1).or_else(|| { + let vals = [self.lo, hi]; + error(&vals, self.pos + 1).or_else(|| { + let vals = [self.lo, hi, lo]; + error(&vals, self.pos + 1) + }) + }) + } else { + let vals = [self.hi]; + error(&vals, self.pos).or_else(|| { + let vals = [self.hi, self.lo]; + error(&vals, self.pos).or_else(|| { + let vals = [self.hi, self.lo, hi]; + error(&vals, self.pos).or_else(|| { + let vals = [self.hi, self.lo, hi, lo]; + error(&vals, self.pos) + }) + }) + }) + } + } + // The next two bytes happen to be invalid ASCII. + Err(e) => + if !self.is_hi { + let vals = [self.lo, e.hi]; + error(&vals, self.pos + 1).or_else(|| { + let vals = [self.lo, e.hi, e.lo]; + error(&vals, self.pos + 1) + }) + } else { + let vals = [self.hi, self.lo, e.hi]; + error(&vals, self.pos).or_else(|| { + let vals = [self.hi, self.lo]; + error(&vals, self.pos).or_else(|| { + let vals = [self.hi, self.lo, e.hi]; + error(&vals, self.pos).or_else(|| { + let vals = [self.hi, self.lo, e.hi, e.lo]; + error(&vals, self.pos) + }) + }) + }) + }, + }, + // Invalid character was for the last character in the input string. + None => + if !self.is_hi { + let vals = [self.lo]; + error(&vals, self.pos + 1) + } else { + let vals = [self.hi]; + error(&vals, self.pos).or_else(|| { + let vals = [self.hi, self.lo]; + error(&vals, self.pos) + }) + }, + }; + rich_error.unwrap_or(InvalidCharError { + invalid: if self.is_hi { self.hi.into() } else { self.lo.into() }, + pos: self.pos, + }) + } +} + +impl fmt::Display for InvalidDigitError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "byte value {:x} is not a valid ASCII encoding of a hex digit (pos {})", + self.invalid_byte(), + self.pos() + ) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for InvalidDigitError {} + /// Purported hex string had odd length. #[derive(Debug, Clone, PartialEq, Eq)] pub struct OddLengthStringError { @@ -240,3 +377,31 @@ impl fmt::Display for InvalidLengthError { #[cfg(feature = "std")] impl std::error::Error for InvalidLengthError {} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn invalid_digit_to_invalid_char_za() { + // Input string: 'za' + let e = InvalidDigitError { hi: b'z', lo: b'a', is_hi: true, pos: 0 }; + let want = InvalidCharError { invalid: 'z', pos: 0 }; + let got = e.into_invalid_char_error(None); + assert_eq!(got, want); + } + + #[test] + fn invalid_digit_to_invalid_char_az() { + // Input string: 'az' + let e = InvalidDigitError { + hi: b'a', + lo: b'z', + is_hi: false, + pos: 0, // This is the position of 'hi'. + }; + let want = InvalidCharError { invalid: 'z', pos: 1 }; // This is the position of 'z'. + let got = e.into_invalid_char_error(None); + assert_eq!(got, want); + } +} diff --git a/src/iter.rs b/src/iter.rs index a7488fd..30399c5 100644 --- a/src/iter.rs +++ b/src/iter.rs @@ -11,7 +11,7 @@ use std::io; #[cfg(all(feature = "alloc", not(feature = "std")))] use crate::alloc::vec::Vec; -use crate::error::{InvalidCharError, OddLengthStringError}; +use crate::error::{InvalidCharError, InvalidDigitError, OddLengthStringError}; use crate::{Case, Table}; /// Convenience alias for `HexToBytesIter>`. @@ -53,11 +53,18 @@ impl<'a> HexToBytesIter> { pub(crate) fn drain_to_slice(self, buf: &mut [u8]) -> Result<(), InvalidCharError> { assert_eq!(self.len(), buf.len()); let mut ptr = buf.as_mut_ptr(); - for byte in self { - // SAFETY: for loop iterates `len` times, and `buf` has length `len` - unsafe { - core::ptr::write(ptr, byte?); - ptr = ptr.add(1); + + let mut iter = self; + while let Some(byte) = iter.next() { + match byte { + Ok(byte) => { + // SAFETY: for loop iterates `len` times, and `buf` has length `len` + unsafe { + core::ptr::write(ptr, byte); + ptr = ptr.add(1); + } + } + Err(e) => return Err(e.into_invalid_char_error(iter.next())), } } Ok(()) @@ -72,12 +79,18 @@ impl<'a> HexToBytesIter> { let len = self.len(); let mut ret = Vec::with_capacity(len); let mut ptr = ret.as_mut_ptr(); - for byte in self { - // SAFETY: for loop iterates `len` times, and `ret` has a capacity of at least `len` - unsafe { - // docs: "`core::ptr::write` is appropriate for initializing uninitialized memory" - core::ptr::write(ptr, byte?); - ptr = ptr.add(1); + + let mut iter = self; + while let Some(byte) = iter.next() { + match byte { + Ok(byte) => { + // SAFETY: for loop iterates `len` times, and `buf` has length `len` + unsafe { + core::ptr::write(ptr, byte); + ptr = ptr.add(1); + } + } + Err(e) => return Err(e.into_invalid_char_error(iter.next())), } } // SAFETY: `len` elements have been initialized, and `ret` has a capacity of at least `len` @@ -95,18 +108,16 @@ impl + ExactSizeIterator> HexToBytesIter { } impl + ExactSizeIterator> Iterator for HexToBytesIter { - type Item = Result; + type Item = Result; #[inline] fn next(&mut self) -> Option { let [hi, lo] = self.iter.next()?; - Some(hex_chars_to_byte(hi, lo).map_err(|(c, is_high)| InvalidCharError { - invalid: c, - pos: if is_high { - (self.original_len - self.iter.len() - 1) * 2 - } else { - (self.original_len - self.iter.len() - 1) * 2 + 1 - }, + Some(hex_digits_to_byte(hi, lo).map_err(|is_hi| InvalidDigitError { + hi, + lo, + is_hi, + pos: (self.original_len - self.iter.len() - 1) * 2, })) } @@ -116,13 +127,11 @@ impl + ExactSizeIterator> Iterator for HexToBytesIte #[inline] fn nth(&mut self, n: usize) -> Option { let [hi, lo] = self.iter.nth(n)?; - Some(hex_chars_to_byte(hi, lo).map_err(|(c, is_high)| InvalidCharError { - invalid: c, - pos: if is_high { - (self.original_len - self.iter.len() - 1) * 2 - } else { - (self.original_len - self.iter.len() - 1) * 2 + 1 - }, + Some(hex_digits_to_byte(hi, lo).map_err(|is_hi| InvalidDigitError { + hi, + lo, + is_hi, + pos: (self.original_len - self.iter.len() - 1) * 2, })) } } @@ -133,18 +142,22 @@ impl + DoubleEndedIterator + ExactSizeIterator> Doub #[inline] fn next_back(&mut self) -> Option { let [hi, lo] = self.iter.next_back()?; - Some(hex_chars_to_byte(hi, lo).map_err(|(c, is_high)| InvalidCharError { - invalid: c, - pos: if is_high { self.iter.len() * 2 } else { self.iter.len() * 2 + 1 }, + Some(hex_digits_to_byte(hi, lo).map_err(|is_hi| InvalidDigitError { + hi, + lo, + is_hi, + pos: self.iter.len() * 2, })) } #[inline] fn nth_back(&mut self, n: usize) -> Option { let [hi, lo] = self.iter.nth_back(n)?; - Some(hex_chars_to_byte(hi, lo).map_err(|(c, is_high)| InvalidCharError { - invalid: c, - pos: if is_high { self.iter.len() * 2 } else { self.iter.len() * 2 + 1 }, + Some(hex_digits_to_byte(hi, lo).map_err(|is_hi| InvalidDigitError { + hi, + lo, + is_hi, + pos: self.iter.len() * 2, })) } } @@ -226,17 +239,29 @@ impl ExactSizeIterator for HexDigitsIter<'_> {} impl core::iter::FusedIterator for HexDigitsIter<'_> {} -/// `hi` and `lo` are bytes representing hex characters. +/// `hi` and `lo` are bytes that encode hex digits as ASCII. /// -/// Returns the valid byte or the invalid input byte and a bool indicating error for `hi` or `lo`. -fn hex_chars_to_byte(hi: u8, lo: u8) -> Result { - let hih = (hi as char).to_digit(16).ok_or((hi, true))?; - let loh = (lo as char).to_digit(16).ok_or((lo, false))?; +/// Returns the valid byte or a bool indicating error for `hi` or `lo`. +fn hex_digits_to_byte(hi: u8, lo: u8) -> Result { + let hih = (hi as char).to_digit(16).ok_or(true)?; + let loh = (lo as char).to_digit(16).ok_or(false)?; let ret = (hih << 4) + loh; + Ok(ret as u8) } +/// The inverse of `hex_digits_to_byte`. +pub(crate) fn byte_to_hex_digits(byte: u8) -> (u8, u8) { + let hi = byte >> 4; + let lo = byte & 0x0f; + + let hi = char::from_digit(hi.into(), 16).unwrap(); + let lo = char::from_digit(lo.into(), 16).unwrap(); + + (hi as u8, lo as u8) +} + /// Iterator over bytes which encodes the bytes and yields hex characters. pub struct BytesToHexIter where @@ -458,7 +483,7 @@ mod tests { let mut got = [0u8; 4]; assert_eq!( iter.drain_to_slice(&mut got).unwrap_err(), - InvalidCharError { invalid: b'g', pos: 0 } + InvalidCharError { invalid: 'g', pos: 0 } ); } @@ -469,7 +494,7 @@ mod tests { let mut got = [0u8; 4]; assert_eq!( iter.drain_to_slice(&mut got).unwrap_err(), - InvalidCharError { invalid: b'g', pos: 4 } + InvalidCharError { invalid: 'g', pos: 4 } ); } @@ -480,7 +505,7 @@ mod tests { let mut got = [0u8; 4]; assert_eq!( iter.drain_to_slice(&mut got).unwrap_err(), - InvalidCharError { invalid: b'g', pos: 7 } + InvalidCharError { invalid: 'g', pos: 7 } ); } @@ -502,21 +527,21 @@ mod tests { fn hex_to_bytes_vec_drain_first_char_error() { let hex = "geadbeef"; let iter = HexToBytesIter::new_unchecked(hex); - assert_eq!(iter.drain_to_vec().unwrap_err(), InvalidCharError { invalid: b'g', pos: 0 }); + assert_eq!(iter.drain_to_vec().unwrap_err(), InvalidCharError { invalid: 'g', pos: 0 }); } #[test] fn hex_to_bytes_vec_drain_middle_char_error() { let hex = "deadgeef"; let iter = HexToBytesIter::new_unchecked(hex); - assert_eq!(iter.drain_to_vec().unwrap_err(), InvalidCharError { invalid: b'g', pos: 4 }); + assert_eq!(iter.drain_to_vec().unwrap_err(), InvalidCharError { invalid: 'g', pos: 4 }); } #[test] fn hex_to_bytes_vec_drain_end_char_error() { let hex = "deadbeeg"; let iter = HexToBytesIter::new_unchecked(hex); - assert_eq!(iter.drain_to_vec().unwrap_err(), InvalidCharError { invalid: b'g', pos: 7 }); + assert_eq!(iter.drain_to_vec().unwrap_err(), InvalidCharError { invalid: 'g', pos: 7 }); } #[test] @@ -574,4 +599,12 @@ mod tests { BytesToHexIter::new(upper_bytes_iter, Case::Upper).rev().collect::(); assert_eq!(upper_got, upper_want); } + + #[test] + fn hex_digits_to_byte_roundtrips() { + let b = hex_digits_to_byte(b'a', b'b').unwrap(); + let want = (b'a', b'b'); + let got = byte_to_hex_digits(b); + assert_eq!(got, want) + } } diff --git a/src/parse.rs b/src/parse.rs index 7a5e27e..7b190bf 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -52,13 +52,12 @@ mod tests { #[test] #[cfg(feature = "alloc")] - fn hex_error() { + fn hex_error_ascii() { use crate::error::{InvalidCharError, OddLengthStringError}; let oddlen = "0123456789abcdef0"; let badchar1 = "Z123456789abcdef"; let badchar2 = "012Y456789abcdeb"; - let badchar3 = "«23456789abcdef"; assert_eq!( Vec::::from_hex(oddlen).unwrap_err(), @@ -70,21 +69,133 @@ mod tests { ); assert_eq!( Vec::::from_hex(badchar1).unwrap_err(), - InvalidCharError { pos: 0, invalid: b'Z' }.into() + InvalidCharError { pos: 0, invalid: 'Z' }.into() ); assert_eq!( Vec::::from_hex(badchar2).unwrap_err(), - InvalidCharError { pos: 3, invalid: b'Y' }.into() + InvalidCharError { pos: 3, invalid: 'Y' }.into() ); + } + + #[test] + #[cfg(feature = "alloc")] + fn hex_error_non_ascii() { + use crate::error::{InvalidCharError, OddLengthStringError}; + + // These are for sanity and documentation purposes. + assert_eq!("«".len(), 2); // 0xC2 0xAB + assert_eq!("✓".len(), 3); // 0xE2 0x9C 0x93 + assert_eq!("𓃾".len(), 4); // 0xF0 0x93 0x83 0xbe + assert_eq!("0123456789abcdef".len(), 16); + + let badchar = "0123456789abcde«"; + assert_eq!(badchar.len(), 17); + assert_eq!( + Vec::::from_hex(badchar).unwrap_err(), + OddLengthStringError { len: 17 }.into(), + ); + + // I would have thought this was length 1. + let badchar = "«"; + assert_eq!(badchar.len(), 2); // Sanity check. + assert_eq!( + Vec::::from_hex(badchar).unwrap_err(), + InvalidCharError { pos: 0, invalid: '«' }.into() + ); + + let badchar = "«✓a"; + assert_eq!(badchar.len(), 6); // Sanity check. + assert_eq!( + Vec::::from_hex(badchar).unwrap_err(), + InvalidCharError { pos: 0, invalid: '«' }.into() + ); + + let badchar = "0123456789abcd«"; + assert_eq!(badchar.len(), 16); // Sanity check. + assert_eq!( + Vec::::from_hex(badchar).unwrap_err(), + InvalidCharError { pos: 14, invalid: '«' }.into() + ); + + let badchar = "0123456789a«✓"; + assert_eq!(badchar.len(), 16); // Sanity check. + assert_eq!( + Vec::::from_hex(badchar).unwrap_err(), + InvalidCharError { pos: 11, invalid: '«' }.into() + ); + + let badchar = "«0123456789abcd"; + assert_eq!(badchar.len(), 16); // Sanity check. + assert_eq!( + Vec::::from_hex(badchar).unwrap_err(), + InvalidCharError { pos: 0, invalid: '«' }.into() + ); + + let badchar = "«✓56789abcdef"; + assert_eq!(badchar.len(), 16); // Sanity check. assert_eq!( - Vec::::from_hex(badchar3).unwrap_err(), - InvalidCharError { pos: 0, invalid: 194 }.into() + Vec::::from_hex(badchar).unwrap_err(), + InvalidCharError { pos: 0, invalid: '«' }.into() ); + + let badchar = "0123456789abc✓"; + assert_eq!(badchar.len(), 16); // Sanity check. + assert_eq!( + Vec::::from_hex(badchar).unwrap_err(), + InvalidCharError { pos: 13, invalid: '✓' }.into() + ); + + let badchar = "✓3456789abcdef"; + assert_eq!(badchar.len(), 16); // Sanity check. + assert_eq!( + Vec::::from_hex(badchar).unwrap_err(), + InvalidCharError { pos: 0, invalid: '✓' }.into() + ); + + let badchar = "✓«56789abcdef"; + assert_eq!(badchar.len(), 16); // Sanity check. + assert_eq!( + Vec::::from_hex(badchar).unwrap_err(), + InvalidCharError { pos: 0, invalid: '✓' }.into() + ); + + let badchar = "0123456789ab𓃾"; + assert_eq!(badchar.len(), 16); // Sanity check. + assert_eq!( + Vec::::from_hex(badchar).unwrap_err(), + InvalidCharError { pos: 12, invalid: '𓃾' }.into() + ); + let badchar = "𓃾456789abcdef"; + assert_eq!(badchar.len(), 16); // Sanity check. + assert_eq!( + Vec::::from_hex(badchar).unwrap_err(), + InvalidCharError { pos: 0, invalid: '𓃾' }.into() + ); + let badchar = "01𓃾6789abcdef"; + assert_eq!(badchar.len(), 16); // Sanity check. + assert_eq!( + Vec::::from_hex(badchar).unwrap_err(), + InvalidCharError { pos: 2, invalid: '𓃾' }.into() + ); + + // Can't handle 4 byte encoded character that is in an odd position. + // let badchar = "0123456789a𓃾f"; + // assert_eq!(badchar.len(), 16); // Sanity check. + // assert_eq!( + // Vec::::from_hex(badchar).unwrap_err(), + // InvalidCharError { pos: 11, invalid: '𓃾' }.into() + // ); + // let badchar = "0𓃾456789abcde"; + // assert_eq!(badchar.len(), 16); // Sanity check. + // assert_eq!( + // Vec::::from_hex(badchar).unwrap_err(), + // InvalidCharError { pos: 1, invalid: '𓃾' }.into() + // ); } #[test] fn hex_error_position() { - use crate::error::InvalidCharError; + use crate::error::InvalidDigitError; let badpos1 = "Z123456789abcdef"; let badpos2 = "012Y456789abcdeb"; let badpos3 = "0123456789abcdeZ"; @@ -92,19 +203,19 @@ mod tests { assert_eq!( HexToBytesIter::new(badpos1).unwrap().next().unwrap().unwrap_err(), - InvalidCharError { pos: 0, invalid: b'Z' } + InvalidDigitError { hi: b'Z', lo: b'1', is_hi: true, pos: 0 } ); assert_eq!( HexToBytesIter::new(badpos2).unwrap().nth(1).unwrap().unwrap_err(), - InvalidCharError { pos: 3, invalid: b'Y' } + InvalidDigitError { hi: b'2', lo: b'Y', is_hi: false, pos: 2 } ); assert_eq!( HexToBytesIter::new(badpos3).unwrap().next_back().unwrap().unwrap_err(), - InvalidCharError { pos: 15, invalid: b'Z' } + InvalidDigitError { hi: b'e', lo: b'Z', is_hi: false, pos: 14 } ); assert_eq!( HexToBytesIter::new(badpos4).unwrap().nth_back(1).unwrap().unwrap_err(), - InvalidCharError { pos: 12, invalid: b'Y' } + InvalidDigitError { hi: b'Y', lo: b'd', is_hi: true, pos: 12 } ); }