From 3ebe221838f48252a3a74dc4c7b2dfa7ae0a2d61 Mon Sep 17 00:00:00 2001 From: Yoonchae Lee Date: Thu, 19 Sep 2024 08:41:52 +0500 Subject: [PATCH] Fix incorrect DocType closing bracket detection when parsing with buffered reader --- Changelog.md | 4 ++++ src/reader/buffered_reader.rs | 2 +- src/reader/mod.rs | 31 +++++++++++++++--------------- src/reader/slice_reader.rs | 2 +- src/reader/state.rs | 2 +- tests/issues.rs | 36 +++++++++++++++++++++++++++++++++++ tests/serde-issues.rs | 14 +++++++++++++- 7 files changed, 72 insertions(+), 19 deletions(-) diff --git a/Changelog.md b/Changelog.md index 9ca10e6f..3a16a166 100644 --- a/Changelog.md +++ b/Changelog.md @@ -17,8 +17,12 @@ ### Bug Fixes +- [#533]: Fix incorrect DocType closing bracket detection when parsing with buffered reader + ### Misc Changes +[#623]: /~https://github.com/tafia/quick-xml/issues/533 + ## 0.36.1 -- 2024-07-23 diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index 95ac1dc7..6446ed8a 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -157,7 +157,7 @@ macro_rules! impl_buffered_source { buf.push(b'!'); self $(.$reader)? .consume(1); - let bang_type = BangType::new(self.peek_one() $(.$await)? ?)?; + let mut bang_type = BangType::new(self.peek_one() $(.$await)? ?)?; loop { match self $(.$reader)? .fill_buf() $(.$await)? { diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 9a81e90e..fb63b2da 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -1019,8 +1019,8 @@ enum BangType { CData, /// Comment, - /// - DocType, + /// . Contains balance of '<' (+1) and '>' (-1) + DocType(i32), } impl BangType { #[inline(always)] @@ -1028,7 +1028,7 @@ impl BangType { Ok(match byte { Some(b'[') => Self::CData, Some(b'-') => Self::Comment, - Some(b'D') | Some(b'd') => Self::DocType, + Some(b'D') | Some(b'd') => Self::DocType(0), _ => return Err(Error::Syntax(SyntaxError::InvalidBangMarkup)), }) } @@ -1040,7 +1040,7 @@ impl BangType { /// - `buf`: buffer with data consumed on previous iterations /// - `chunk`: data read on current iteration and not yet consumed from reader #[inline(always)] - fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { + fn parse<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { match self { Self::Comment => { for i in memchr::memchr_iter(b'>', chunk) { @@ -1085,14 +1085,15 @@ impl BangType { } } } - Self::DocType => { - for i in memchr::memchr_iter(b'>', chunk) { - let content = &chunk[..i]; - let balance = memchr::memchr2_iter(b'<', b'>', content) - .map(|p| if content[p] == b'<' { 1i32 } else { -1 }) - .sum::(); - if balance == 0 { - return Some((content, i + 1)); // +1 for `>` + Self::DocType(ref mut balance) => { + for i in memchr::memchr2_iter(b'<', b'>', chunk) { + if chunk[i] == b'<' { + *balance += 1; + } else { + if *balance == 0 { + return Some((&chunk[..i], i + 1)); // +1 for `>` + } + *balance -= 1; } } } @@ -1104,7 +1105,7 @@ impl BangType { match self { Self::CData => Error::Syntax(SyntaxError::UnclosedCData), Self::Comment => Error::Syntax(SyntaxError::UnclosedComment), - Self::DocType => Error::Syntax(SyntaxError::UnclosedDoctype), + Self::DocType(_) => Error::Syntax(SyntaxError::UnclosedDoctype), } } } @@ -1414,7 +1415,7 @@ mod test { .unwrap(); assert_eq!( (ty, Bytes(bytes)), - (BangType::DocType, Bytes(b"!DOCTYPE")) + (BangType::DocType(0), Bytes(b"!DOCTYPE")) ); assert_eq!(position, 10); } @@ -1488,7 +1489,7 @@ mod test { .unwrap(); assert_eq!( (ty, Bytes(bytes)), - (BangType::DocType, Bytes(b"!doctype")) + (BangType::DocType(0), Bytes(b"!doctype")) ); assert_eq!(position, 10); } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index 6b4c2804..e1082cac 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -307,7 +307,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { // start with it. debug_assert_eq!(self[0], b'!'); - let bang_type = BangType::new(self[1..].first().copied())?; + let mut bang_type = BangType::new(self[1..].first().copied())?; if let Some((bytes, i)) = bang_type.parse(&[], self) { *position += i as u64; diff --git a/src/reader/state.rs b/src/reader/state.rs index 0119022d..f906c5b6 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -144,7 +144,7 @@ impl ReaderState { // https://www.w3.org/TR/xml11/#sec-prolog-dtd // HTML5 allows mixed case for doctype declarations: // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state - BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { + BangType::DocType(0) if uncased_starts_with(buf, b"!DOCTYPE") => { match buf[8..].iter().position(|&b| !is_whitespace(b)) { Some(start) => Ok(Event::DocType(BytesText::wrap( // Cut of `!DOCTYPE` and any number of spaces from start diff --git a/tests/issues.rs b/tests/issues.rs index 95dd64ff..d4fdcbb0 100644 --- a/tests/issues.rs +++ b/tests/issues.rs @@ -171,6 +171,24 @@ mod issue514 { } } +/// Regression test for /~https://github.com/tafia/quick-xml/issues/590 +#[test] +fn issue590() { + let mut reader = Reader::from_reader(BufReader::with_capacity( + 16, + &b"]>"[..], + // 0 7 ^15 ^23 + //[ ] = capacity + )); + let mut buf = Vec::new(); + loop { + match reader.read_event_into(&mut buf).unwrap() { + Event::Eof => break, + _ => (), + } + } +} + /// Regression test for /~https://github.com/tafia/quick-xml/issues/604 mod issue604 { use super::*; @@ -477,3 +495,21 @@ fn issue776() { Event::End(BytesEnd::new(r#"tag attr=">""#)) ); } + +/// Regression test for /~https://github.com/tafia/quick-xml/issues/801 +/// Angle brackets are read in different buffer +#[test] +fn issue801() { + let xml = b" ]>"; + let reader = BufReader::with_capacity(2, &xml[..]); + let mut reader = Reader::from_reader(reader); + let mut buf = Vec::new(); + loop { + buf.clear(); + match reader.read_event_into(&mut buf) { + Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), + Ok(Event::Eof) => break, + Ok(_) => {} + } + } +} diff --git a/tests/serde-issues.rs b/tests/serde-issues.rs index 2726e45a..9781ae72 100644 --- a/tests/serde-issues.rs +++ b/tests/serde-issues.rs @@ -3,11 +3,12 @@ //! Name each module / test as `issue` and keep sorted by issue number use pretty_assertions::assert_eq; -use quick_xml::de::from_str; +use quick_xml::de::{from_reader, from_str}; use quick_xml::se::{to_string, to_string_with_root}; use serde::de::{Deserializer, IgnoredAny}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; +use std::io::BufReader; /// Regression tests for /~https://github.com/tafia/quick-xml/issues/252. mod issue252 { @@ -321,6 +322,17 @@ fn issue510() { ); } +/// Regression test for /~https://github.com/tafia/quick-xml/issues/533. +#[test] +fn issue533() { + #[derive(Deserialize)] + struct X {} + + let xml = "]>"; + let reader = BufReader::with_capacity(20, xml.as_bytes()); + let _: X = from_reader(reader).unwrap(); +} + /// Regression test for /~https://github.com/tafia/quick-xml/issues/537. /// /// This test checks that special `xmlns:xxx` attributes uses full name of