diff --git a/src/encoding.rs b/src/encoding.rs
index 04d54495..2dc3b378 100644
--- a/src/encoding.rs
+++ b/src/encoding.rs
@@ -1,9 +1,10 @@
 //! A module for wrappers that encode / decode data.
 
 use std::borrow::Cow;
+use std::io::{self, BufRead, Read};
 
 #[cfg(feature = "encoding")]
-use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8};
+use encoding_rs::{CoderResult, Decoder as ExtDecoder, Encoding, UTF_16BE, UTF_16LE, UTF_8};
 
 use crate::{Error, Result};
 
@@ -184,4 +185,227 @@ pub fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> {
     }
 }
 
+/// A reference to an encoding together with information about how it was retrieved.
+///
+/// The state transition diagram:
+///
+/// ```mermaid
+/// flowchart LR
+///   Implicit    -- from_str       --> Explicit
+///   Implicit    -- BOM            --> BomDetected
+///   Implicit    -- "encoding=..." --> XmlDetected
+///   BomDetected -- "encoding=..." --> XmlDetected
+/// ```
+#[cfg(feature = "encoding")]
+#[derive(Clone, Copy)]
+pub(crate) enum EncodingRef {
+    /// Encoding was implicitly assumed to have a specified value. It can be refined
+    /// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
+    Implicit(&'static Encoding),
+    /// Encoding was explicitly set to the desired value. It cannot be changed
+    /// nor by BOM, nor by parsing XML declaration (`<?xml encoding=... ?>`)
+    Explicit(&'static Encoding),
+    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
+    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
+    BomDetected(&'static Encoding),
+    /// Encoding was detected using XML declaration event (`<?xml encoding=... ?>`).
+    /// It can no longer change
+    XmlDetected(&'static Encoding),
+}
+#[cfg(feature = "encoding")]
+impl EncodingRef {
+    #[inline]
+    pub(crate) fn encoding(&self) -> &'static Encoding {
+        match self {
+            Self::Implicit(e) => e,
+            Self::Explicit(e) => e,
+            Self::BomDetected(e) => e,
+            Self::XmlDetected(e) => e,
+        }
+    }
+    #[inline]
+    pub(crate) fn can_be_refined(&self) -> bool {
+        match self {
+            Self::Implicit(_) | Self::BomDetected(_) => true,
+            Self::Explicit(_) | Self::XmlDetected(_) => false,
+        }
+    }
+}
+
+/// A buffered reader that decodes the bytes of the wrapped reader to UTF-8.
+///
+/// The bytes handed out through [`BufRead`] and [`Read`] are always the decoded
+/// UTF-8 data, never the raw bytes of the wrapped reader.
+#[cfg(feature = "encoding")]
+struct DecodingBufReader<R> {
+    /// The source of the raw, not yet decoded bytes
+    inner: R,
+    /// Decoded UTF-8 data. Everything before `current_pos` is already consumed
+    decoded_buffer: Vec<u8>,
+    /// Position of the first unconsumed byte in `decoded_buffer`
+    current_pos: usize,
+    /// Set when the decoder must not be fed anymore: either the end of the input
+    /// was signalled to it or a decoding error occurred
+    finished: bool,
+
+    /// The decoder which converts the bytes from `inner` to UTF-8
+    decoder: ExtDecoder,
+    /// The encoding in use together with information about how it was chosen
+    encoding: EncodingRef,
+}
+
+#[cfg(feature = "encoding")]
+impl<R: BufRead> BufRead for DecodingBufReader<R> {
+    fn fill_buf(&mut self) -> io::Result<&[u8]> {
+        if self.current_pos >= self.decoded_buffer.len() {
+            // Everything was consumed, so the buffer can be reused from the start
+            self.decoded_buffer.clear();
+            self.current_pos = 0;
+
+            // A chunk can end in the middle of a multi-byte sequence and decode to
+            // nothing. An empty result would signal EOF to the caller, so read
+            // until something was decoded or the input really ended
+            while self.decoded_buffer.is_empty() && !self.finished {
+                let data = self.inner.fill_buf()?;
+                let read = Self::decode_chunk(
+                    &mut self.decoder,
+                    &mut self.decoded_buffer,
+                    &mut self.finished,
+                    data,
+                )?;
+                self.inner.consume(read);
+            }
+        }
+        Ok(&self.decoded_buffer[self.current_pos..])
+    }
+
+    fn consume(&mut self, amt: usize) {
+        // Clamp to the amount of decoded data, not to the capacity of the allocation
+        self.current_pos = self
+            .current_pos
+            .saturating_add(amt)
+            .min(self.decoded_buffer.len());
+    }
+}
+
+#[cfg(feature = "encoding")]
+impl<R: BufRead> Read for DecodingBufReader<R> {
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        // Serve the decoded data. Reading from `inner` directly would bypass the decoder
+        let available = self.fill_buf()?;
+        let amount = available.len().min(buf.len());
+        buf[..amount].copy_from_slice(&available[..amount]);
+        self.consume(amount);
+        Ok(amount)
+    }
+}
+
+#[cfg(feature = "encoding")]
+impl<R: BufRead> DecodingBufReader<R> {
+    /// Creates a reader that implicitly assumes UTF-8 as the encoding of `inner`.
+    fn new(inner: R) -> Self {
+        DecodingBufReader {
+            inner,
+            decoded_buffer: Vec::new(),
+            current_pos: 0,
+            finished: false,
+
+            decoder: UTF_8.new_decoder(),
+            encoding: EncodingRef::Implicit(UTF_8),
+        }
+    }
+
+    /// Returns the raw, not yet decoded bytes buffered by the wrapped reader.
+    fn get_raw_buffer(&mut self) -> io::Result<&[u8]> {
+        self.inner.fill_buf()
+    }
+
+    /// Drops the already consumed data from the front of the buffer
+    fn shuffle(&mut self) {
+        if self.current_pos == 0 {
+            return;
+        }
+
+        // `drain` moves the unconsumed bytes to the front and adjusts the length
+        self.decoded_buffer.drain(..self.current_pos);
+        self.current_pos = 0;
+    }
+
+    /// Reallocate a smaller buffer with the provided size
+    fn shrink_buffer(&mut self, size: usize) {
+        self.shuffle();
+        self.decoded_buffer.shrink_to(size);
+    }
+
+    /// Sets the encoding explicitly and decodes all following input with it.
+    ///
+    /// Data that was already decoded is not decoded again.
+    fn set_encoding(&mut self, encoding: &'static Encoding) {
+        self.encoding = EncodingRef::Explicit(encoding);
+        // An explicit encoding cannot be changed by a BOM, so the BOM is only
+        // removed and never used to switch the encoding
+        self.decoder = encoding.new_decoder_with_bom_removal();
+    }
+
+    /// Decodes `data` and appends the result to the buffer of decoded data.
+    ///
+    /// Empty `data` signals the end of the input. Returns the number of bytes
+    /// of `data` that were read by the decoder.
+    fn feed(&mut self, data: &[u8]) -> io::Result<usize> {
+        Self::decode_chunk(
+            &mut self.decoder,
+            &mut self.decoded_buffer,
+            &mut self.finished,
+            data,
+        )
+    }
+
+    /// Implementation of [`Self::feed`] that borrows only the required fields,
+    /// so it can be called while `data` is still borrowed from `inner`.
+    fn decode_chunk(
+        decoder: &mut ExtDecoder,
+        output: &mut Vec<u8>,
+        finished: &mut bool,
+        data: &[u8],
+    ) -> io::Result<usize> {
+        if *finished {
+            // The decoder must not be used after the end of the stream was signalled
+            return Ok(0);
+        }
+        let last = data.is_empty();
+
+        // The decoded data can be larger than the input (for example for UTF-16),
+        // so ask the decoder for the worst case instead of using `data.len()`
+        let start = output.len();
+        let end = decoder
+            .max_utf8_buffer_length(data.len())
+            .and_then(|max| start.checked_add(max))
+            .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "Decoded data is too large"))?;
+        // The decoder writes into an initialized slice, reserved capacity is not enough
+        output.resize(end, 0);
+
+        let (result, read, written, had_errors) =
+            decoder.decode_to_utf8(data, &mut output[start..], last);
+
+        let error = match result {
+            CoderResult::InputEmpty if !had_errors => {
+                output.truncate(start + written);
+                *finished = last;
+                return Ok(read);
+            }
+            CoderResult::InputEmpty => "Errors decoding",
+            // Must not happen, because the worst case size was requested above
+            CoderResult::OutputFull => "Decoder output buffer is too small",
+        };
+        // Drop the partial output. The decoder already processed input that the
+        // caller will not consume, so it must not be used again
+        output.truncate(start);
+        *finished = true;
+        Err(io::Error::new(io::ErrorKind::Other, error))
+    }
+}
+
+#[cfg(test)]
+mod tests {}
+
 // TODO: add some tests for functions
diff --git a/src/reader/mod.rs b/src/reader/mod.rs
index ef663f90..79824cd4 100644
--- a/src/reader/mod.rs
+++ b/src/reader/mod.rs
@@ -3,10 +3,10 @@
 use std::str::from_utf8;
 
 #[cfg(feature = "encoding")]
-use encoding_rs::{Encoding, UTF_8};
+use encoding_rs::UTF_8;
 
 #[cfg(feature = "encoding")]
-use crate::encoding::detect_encoding;
+use crate::encoding::{detect_encoding, EncodingRef};
 use crate::encoding::Decoder;
 use crate::errors::{Error, Result};
 use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
@@ -179,53 +179,6 @@ enum TagState {
     Exit,
 }
 
-/// A reference to an encoding together with information about how it was retrieved.
-///
-/// The state transition diagram:
-///
-/// ```mermaid
-/// flowchart LR
-///   Implicit    -- from_str       --> Explicit
-///   Implicit    -- BOM            --> BomDetected
-///   Implicit    -- "encoding=..." --> XmlDetected
-///   BomDetected -- "encoding=..." --> XmlDetected
-/// ```
-#[cfg(feature = "encoding")]
-#[derive(Clone, Copy)]
-enum EncodingRef {
-    /// Encoding was implicitly assumed to have a specified value. It can be refined
-    /// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
-    Implicit(&'static Encoding),
-    /// Encoding was explicitly set to the desired value. It cannot be changed
-    /// nor by BOM, nor by parsing XML declaration (`<?xml encoding=... ?>`)
-    Explicit(&'static Encoding),
-    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
-    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
-    BomDetected(&'static Encoding),
-    /// Encoding was detected using XML declaration event (`<?xml encoding=... ?>`).
-    /// It can no longer change
-    XmlDetected(&'static Encoding),
-}
-#[cfg(feature = "encoding")]
-impl EncodingRef {
-    #[inline]
-    fn encoding(&self) -> &'static Encoding {
-        match self {
-            Self::Implicit(e) => e,
-            Self::Explicit(e) => e,
-            Self::BomDetected(e) => e,
-            Self::XmlDetected(e) => e,
-        }
-    }
-    #[inline]
-    fn can_be_refined(&self) -> bool {
-        match self {
-            Self::Implicit(_) | Self::BomDetected(_) => true,
-            Self::Explicit(_) | Self::XmlDetected(_) => false,
-        }
-    }
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// A low level encoding-agnostic XML event reader.