Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix processing instruction parsing #753

Merged
merged 2 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,12 @@ resolve predefined entities.
it can handle every attribute that does not match existing cases within an enum variant.
- [#722]: Allow to pass owned strings to `Writer::create_element`. This is breaking change!
- [#275]: Added `ElementWriter::new_line()` which enables pretty printing elements with multiple attributes.
- [#743]: Add `Deserializer::get_ref()` to get XML Reader from serde Deserializer
- [#734]: Add helper functions to resolve predefined XML and HTML5 entities:
- [#743]: Added `Deserializer::get_ref()` to get XML Reader from serde Deserializer
- [#734]: Added helper functions to resolve predefined XML and HTML5 entities:
- `quick_xml::escape::resolve_predefined_entity`
- `quick_xml::escape::resolve_xml_entity`
- `quick_xml::escape::resolve_html5_entity`
- [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`.

### Bug Fixes

Expand All @@ -50,6 +51,7 @@ resolve predefined entities.
- [#684]: Fix incorrect position reported for `Error::IllFormed(MissingDoctypeName)`.
- [#704]: Fix empty tags with attributes not being expanded when `expand_empty_elements` is set to true.
- [#683]: Use local tag name when check tag name against possible names for field.
- [#753]: Correctly determine end of processing instructions and XML declaration.

### Misc Changes

Expand Down Expand Up @@ -98,6 +100,7 @@ resolve predefined entities.
[#738]: https://github.com/tafia/quick-xml/pull/738
[#743]: https://github.com/tafia/quick-xml/pull/743
[#748]: https://github.com/tafia/quick-xml/pull/748
[#753]: https://github.com/tafia/quick-xml/pull/753
[`DeEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.DeEvent.html
[`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html
[`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html
Expand Down
43 changes: 43 additions & 0 deletions src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,49 @@ macro_rules! impl_buffered_source {
Ok((&buf[start..], done))
}

// Reads the body of a processing instruction from the underlying reader into `buf`.
//
// Data is appended to `buf` chunk by chunk until `PiParser` reports the end of the
// instruction or the reader has no more data. The final `>` is consumed from the
// reader but not stored in `buf`. `position` is increased by the number of bytes
// consumed. Returns the newly appended part of `buf` and a flag telling whether
// the end of the instruction was found.
$($async)? fn read_pi $(<$lf>)? (
&mut self,
buf: &'b mut Vec<u8>,
position: &mut usize,
) -> Result<(&'b [u8], bool)> {
// The parser keeps state between chunks, so `?` and `>` split across chunks are still matched
let mut parser = super::PiParser::default();

// Total number of bytes consumed from the reader by this call
let mut read = 0;
let mut done = false;
// Only the data appended after this index belongs to the result
let start = buf.len();
while !done {
let used = {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
// No more data: leave the loop with `done == false`
Ok(n) if n.is_empty() => break,
Ok(n) => n,
// An interrupted read is retried
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
// Account for the bytes already consumed before reporting the error
*position += read;
return Err(Error::Io(e.into()));
}
};

match parser.feed(available) {
Some(i) => {
// `i` is the position just after `?>`. We do not include `>` in the data,
// but the `?` before it is kept
buf.extend_from_slice(&available[..i - 1]);
done = true;
i
}
None => {
// End not found yet: store the whole chunk and ask for more data
buf.extend_from_slice(available);
available.len()
}
}
};
// `available` is no longer borrowed here, so the reader can be advanced
self $(.$reader)? .consume(used);
read += used;
}
*position += read;

Ok((&buf[start..], done))
}

$($async)? fn read_bang_element $(<$lf>)? (
&mut self,
buf: &'b mut Vec<u8>,
Expand Down
31 changes: 25 additions & 6 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ macro_rules! read_until_close {
},
// `<?` - processing instruction
Ok(Some(b'?')) => match $reader
.read_bytes_until(b'>', $buf, &mut $self.state.offset)
.read_pi($buf, &mut $self.state.offset)
$(.$await)?
{
Ok((bytes, true)) => $self.state.emit_question_mark(bytes),
Expand Down Expand Up @@ -428,10 +428,12 @@ macro_rules! read_to_end {
mod async_tokio;
mod buffered_reader;
mod ns_reader;
mod pi;
mod slice_reader;
mod state;

pub use ns_reader::NsReader;
pub use pi::PiParser;

/// Range of input in bytes, that corresponds to some piece of XML
pub type Span = Range<usize>;
Expand Down Expand Up @@ -816,12 +818,29 @@ trait XmlSource<'r, B> {
position: &mut usize,
) -> Result<(&'r [u8], bool)>;

/// Read input until comment, CDATA or processing instruction is finished.
/// Reads input until the processing instruction is finished.
///
/// This method expects that `<?` has already been read.
///
/// Returns a slice of the data read up to the end of the processing instruction
/// (`>`), which is not included in the result (the `?` at the end is included),
/// together with a flag. The flag is `true` if the end of the processing
/// instruction was found and `false` if the input (`Self`) was exhausted
/// before that.
///
/// # Parameters
/// - `buf`: Buffer that could be filled from an input (`Self`) and
/// from which [events] could borrow their data
/// - `position`: Will be increased by the number of bytes consumed
///
/// [events]: crate::events::Event
fn read_pi(&mut self, buf: B, position: &mut usize) -> Result<(&'r [u8], bool)>;

/// Read input until comment or CDATA is finished.
///
/// This method expect that `<` already was read.
///
/// Returns a slice of data read up to end of comment, CDATA or processing
/// instruction (`>`), which does not include into result.
/// Returns a slice of data read up to end of comment or CDATA (`>`),
/// which does not include into result.
///
/// If input (`Self`) is exhausted and nothing was read, returns `None`.
///
Expand Down Expand Up @@ -1764,11 +1783,11 @@ mod test {

#[$test]
$($async)? fn processing_instruction() {
let mut reader = Reader::from_str("<?xml-stylesheet?>");
let mut reader = Reader::from_str("<?xml-stylesheet '? >\" ?>");

assert_eq!(
reader.$read_event($buf) $(.$await)? .unwrap(),
Event::PI(BytesText::from_escaped("xml-stylesheet"))
Event::PI(BytesText::from_escaped("xml-stylesheet '? >\" "))
);
}

Expand Down
105 changes: 105 additions & 0 deletions src/reader/pi.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
//! Contains a parser for an XML processing instruction.

/// A parser that searches for a `?>` sequence in a slice.
///
/// To use the parser, create an instance of it and [`feed`] data into it.
/// After a successful search the parser returns [`Some`] with the position where
/// the processing instruction ends (the position after `?>`). If the search was
/// unsuccessful, [`None`] is returned. You would typically expect a positive
/// search result, so you should keep feeding new data until you get one.
///
/// NOTE: after a successful match the parser does not return to its initial
/// state and should not be used anymore. Create a new parser if you want to perform
/// a new search.
///
/// # Example
///
/// ```
/// # use quick_xml::reader::PiParser;
/// # use pretty_assertions::assert_eq;
/// let mut parser = PiParser::default();
///
/// // Parse `<?instruction with = 'some > and ?' inside?>and the text follow...`
/// // split into three chunks
/// assert_eq!(parser.feed(b"<?instruction"), None);
/// // ...get new chunk of data
/// assert_eq!(parser.feed(b" with = 'some > and ?"), None);
/// // ...get another chunk of data
/// assert_eq!(parser.feed(b"' inside?>and the text follow..."), Some(10));
/// //                       ^         ^
/// //                       0         10
/// ```
///
/// [`feed`]: Self::feed()
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct PiParser(
/// A flag that indicates whether the `bytes` passed to the previous attempt
/// to find the end ended with `?`.
pub bool,
);

impl PiParser {
    /// Looks for the end of a processing instruction in the provided slice.
    ///
    /// A processing instruction is terminated by the first `?>` sequence, and
    /// that sequence cannot be escaped. The `?` may have been the last byte of
    /// the slice passed to the previous call; the parser remembers that between
    /// calls.
    ///
    /// Returns the position just after `?>`, or `None` if the sequence was not
    /// found in `bytes`.
    ///
    /// [Section 2.6]: Parameter entity references MUST NOT be recognized within
    /// processing instructions, so the parser does not search for them.
    ///
    /// # Parameters
    /// - `bytes`: a slice in which to find the end of a processing instruction.
    ///   Should contain text in an ASCII-compatible encoding
    ///
    /// [Section 2.6]: https://www.w3.org/TR/xml11/#sec-pi
    pub fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
        // Whether the chunk seen by the previous call ended with `?`
        let pending_question_mark = self.0;

        let closing = memchr::memchr_iter(b'>', bytes).find(|&gt| match gt.checked_sub(1) {
            // `>` is preceded by a byte of this chunk: it must be `?`
            Some(before) => bytes[before] == b'?',
            // `>` is the very first byte: the `?` could only come from the previous chunk
            None => pending_question_mark,
        });

        match closing {
            // +1 for `>` which should be included in event
            Some(gt) => Some(gt + 1),
            None => {
                // Remember whether a `>` at the start of the next chunk would finish `?>`
                self.0 = bytes.last() == Some(&b'?');
                None
            }
        }
    }
}

#[test]
fn pi() {
    use pretty_assertions::assert_eq;

    /// Runs a single `feed` on a parser that starts in the `had_question_mark` state.
    ///
    /// Returns `Ok(pos)` with the position in the buffer where the processing
    /// instruction ends, or `Err(internal_state)` if parsing is not done yet.
    fn parse_pi(bytes: &[u8], had_question_mark: bool) -> Result<usize, bool> {
        let mut parser = PiParser(had_question_mark);
        let end = parser.feed(bytes);
        end.ok_or(parser.0)
    }

    // Each row is (bytes passed to `feed`, state before the call, expected outcome).
    // The trailing comment shows which character was seen last before calling `feed`:
    // `x` means any character, the pipe denotes the start of the buffer passed to `feed`.
    let cases: [(&[u8], bool, Result<usize, bool>); 10] = [
        (b"", false, Err(false)), // x|
        (b"", true, Err(false)),  // ?|
        //
        (b"?", false, Err(true)), // x|?
        (b"?", true, Err(true)),  // ?|?
        //
        (b">", false, Err(false)), // x|>
        (b">", true, Ok(1)),       // ?|>
        //
        (b"?>", false, Ok(2)), // x|?>
        (b"?>", true, Ok(2)),  // ?|?>
        //
        (b">?>", false, Ok(3)), // x|>?>
        (b">?>", true, Ok(1)),  // ?|>?>
    ];

    for &(bytes, had_question_mark, expected) in cases.iter() {
        assert_eq!(parse_pi(bytes, had_question_mark), expected);
    }
}
19 changes: 18 additions & 1 deletion src/reader/slice_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use encoding_rs::{Encoding, UTF_8};
use crate::errors::{Error, Result, SyntaxError};
use crate::events::Event;
use crate::name::QName;
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
use crate::reader::{is_whitespace, BangType, PiParser, ReadElementState, Reader, Span, XmlSource};

/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
/// This implementation supports not using an intermediate buffer as the byte slice
Expand Down Expand Up @@ -275,6 +275,23 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
}
}

/// Reads a processing instruction directly from the borrowed slice.
///
/// No intermediate buffer is used: the returned data is a sub-slice of the
/// input. On success `self` is advanced past the closing `?>`; otherwise all
/// remaining input is returned and `self` becomes empty. In both cases
/// `position` is increased by the number of bytes taken from `self`.
fn read_pi(&mut self, _buf: (), position: &mut usize) -> Result<(&'a [u8], bool)> {
    // Copy the inner reference so that the result can borrow for the whole `'a`
    let input: &'a [u8] = *self;

    match PiParser::default().feed(input) {
        Some(end) => {
            *position += end;
            *self = &input[end..];
            // `end` points just after `?>`; the `>` itself is not part of the data
            Ok((&input[..end - 1], true))
        }
        None => {
            *position += input.len();
            *self = &[];
            Ok((input, false))
        }
    }
}

fn read_bang_element(
&mut self,
_buf: (),
Expand Down