From 535914eaf8ecc3a823ba7a7dc6861d9def6cef39 Mon Sep 17 00:00:00 2001 From: Kornel Date: Mon, 22 May 2023 16:42:45 +0100 Subject: [PATCH] Forbid whitespace before XML prolog --- src/reader/parser.rs | 15 +++++---- src/reader/parser/outside_tag.rs | 52 +++++++++++++++++++++++++++++++- tests/event_reader.rs | 2 +- tests/oasis.fail.txt | 2 -- tests/sun-not-wf.fail.txt | 1 - tests/xmltest.fail.txt | 1 - 6 files changed, 61 insertions(+), 12 deletions(-) diff --git a/src/reader/parser.rs b/src/reader/parser.rs index a6a7373c..ff163cb4 100644 --- a/src/reader/parser.rs +++ b/src/reader/parser.rs @@ -90,10 +90,11 @@ pub(crate) struct PullParser { #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] enum Encountered { None = 0, - Declaration = 1, - Comment = 2, - Doctype = 3, - Element = 4, + AnyChars, // whitespace before self.inside_processing_instruction(t, s), State::InsideDoctype(s) => self.inside_doctype(t, s), State::InsideDeclaration(s) => self.inside_declaration(t, s), + State::DocumentStart => self.document_start(t), } } diff --git a/src/reader/parser/outside_tag.rs b/src/reader/parser/outside_tag.rs index fc106c71..bbd28254 100644 --- a/src/reader/parser/outside_tag.rs +++ b/src/reader/parser/outside_tag.rs @@ -128,7 +128,6 @@ impl PullParser { Token::ProcessingInstructionStart => self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event), - Token::CDataStart if self.depth() > 0 => { self.into_state(State::InsideCData, next_event) }, @@ -138,4 +137,55 @@ impl PullParser { } } } + + pub fn document_start(&mut self, t: Token) -> Option { + debug_assert!(self.encountered < Encountered::Declaration); + + match t { + Token::Character(c) => { + let next_event = self.set_encountered(Encountered::AnyChars); + + if !is_whitespace_char(c) { + return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t))); + } + self.inside_whitespace = true; + + // skip whitespace outside of the root element + if (self.config.c.trim_whitespace && self.buf.is_empty()) || + (self.depth() == 0 && self.config.c.ignore_root_level_whitespace) { + return self.into_state(State::OutsideTag, next_event); + } + + self.push_pos(); + self.buf.push(c); + self.into_state(State::OutsideTag, next_event) + }, + + Token::CommentStart => { + let next_event = self.set_encountered(Encountered::Comment); + self.into_state(State::InsideComment, next_event) + } + + Token::OpeningTagStart => { + let next_event = self.set_encountered(Encountered::Element); + self.nst.push_empty(); + self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event) + }, + + Token::DoctypeStart => { + let next_event = self.set_encountered(Encountered::Doctype); + // We don't have a doctype event so skip this position + // FIXME: update when we have a doctype event + self.next_pos(); + self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event) + }, + + Token::ProcessingInstructionStart => { + self.push_pos(); + self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName)) + }, + + _ => Some(self.error(SyntaxError::UnexpectedToken(t))), + } + } } diff --git a/tests/event_reader.rs b/tests/event_reader.rs index f6102a43..122c6cd5 100644 --- a/tests/event_reader.rs +++ b/tests/event_reader.rs @@ -285,7 +285,7 @@ fn tabs_1() { test( b"\t\t", br#" - |1:2 StartDocument(1.0, UTF-8) + |1:1 StartDocument(1.0, UTF-8) |1:2 StartElement(a) |1:6 StartElement(b) |1:6 EndElement(b) diff --git a/tests/oasis.fail.txt b/tests/oasis.fail.txt index 2a2a9b6e..ef7ccaa8 100644 --- a/tests/oasis.fail.txt +++ b/tests/oasis.fail.txt @@ -1,6 +1,5 @@ o-p04pass1 p04pass1.xml names with all valid ASCII characters, and one from each other class in NameChar ; 5:8 Element A.-:̀· prefix is unbound o-p05pass1 p05pass1.xml various valid Name constructions ; 2:8 Element A:._-0 prefix is unbound -o-p01fail1 p01fail1.xml S cannot occur before the prolog o-p09fail1 p09fail1.xml EntityValue excludes '%' o-p09fail2 p09fail2.xml EntityValue excludes '&' o-p12fail1 p12fail1.xml '"' excluded @@ -10,7 +9,6 @@ o-p12fail4 p12fail4.xml '>' excluded o-p12fail5 p12fail5.xml '<' excluded o-p12fail6 p12fail6.xml built-in entity refs excluded o-p12fail7 p12fail7.xml The public ID has a tab character, which is disallowed -o-p22fail1 p22fail1.xml prolog must start with XML decl o-p30fail1 p30fail1.xml An XML declaration is not the same as a TextDecl o-p31fail1 p31fail1.xml external subset excludes doctypedecl o-p32fail3 p32fail3.xml initial S is required diff --git a/tests/sun-not-wf.fail.txt b/tests/sun-not-wf.fail.txt index a8c26c18..e06a5b96 100644 --- a/tests/sun-not-wf.fail.txt +++ b/tests/sun-not-wf.fail.txt @@ -27,7 +27,6 @@ pubid02 pubid02.xml Illegal characters in public ID pubid03 pubid03.xml Illegal characters in public ID pubid04 pubid04.xml Illegal characters in public ID pubid05 pubid05.xml SGML-ism: public ID without system ID -sgml02 sgml02.xml XML declaration must be at the very beginning of a document; it"s not a processing instruction sgml04 sgml04.xml ATTLIST declarations apply to only one element, unlike SGML sgml05 sgml05.xml ELEMENT declarations apply to only one element, unlike SGML sgml06 sgml06.xml ATTLIST declarations are never global, unlike in SGML diff --git a/tests/xmltest.fail.txt b/tests/xmltest.fail.txt index 39a773bb..f09412e9 100644 --- a/tests/xmltest.fail.txt +++ b/tests/xmltest.fail.txt @@ -51,7 +51,6 @@ not-wf-sa-136 136.xml Tag omission is invalid in XML. not-wf-sa-137 137.xml Space is required before a content model. not-wf-sa-138 138.xml Invalid syntax for content particle. not-wf-sa-139 139.xml The element-content model should not be empty. -not-wf-sa-147 147.xml XML Declaration may not be preceded by whitespace. not-wf-sa-149 149.xml XML Declaration may not be within a DTD. not-wf-sa-158 158.xml SGML-ism: "#NOTATION gif" can't have attributes. not-wf-sa-159 159.xml Uses '&' unquoted in an entity declaration, which is illegal syntax for an entity reference.