From 05e9702e94af93dec7f3f60dd157869c2b3a7828 Mon Sep 17 00:00:00 2001 From: Arnavion Date: Thu, 22 Nov 2018 20:25:40 -0800 Subject: [PATCH 1/6] Read files inside ZipArchive lazily. Before this change, the entire central directory would be parsed to extract the details of all files in the archive up-front. With this change, the central directory is parsed on-demand - only enough headers are parsed till the caller finds the file it wanted. This speeds up callers who only wish to pick out specific files from the archive, especially for archives with many thousands of files or archives hosted on slow storage media. In the case where the caller *did* want every file in the archive, this performs no worse than the original code. --- src/read.rs | 147 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 101 insertions(+), 46 deletions(-) diff --git a/src/read.rs b/src/read.rs index ba20214c8..05c1d8b2c 100644 --- a/src/read.rs +++ b/src/read.rs @@ -54,10 +54,12 @@ mod ffi { pub struct ZipArchive { reader: R, + number_of_files: usize, files: Vec, names_map: HashMap, offset: u64, comment: Vec, + last_central_header_end: u64, } enum ZipFileReader<'a> { @@ -203,26 +205,14 @@ impl ZipArchive let (archive_offset, directory_start, number_of_files) = Self::get_directory_counts(&mut reader, &footer, cde_start_pos)?; - let mut files = Vec::new(); - let mut names_map = HashMap::new(); - - if let Err(_) = reader.seek(io::SeekFrom::Start(directory_start)) { - return Err(ZipError::InvalidArchive("Could not seek to start of central directory")); - } - - for _ in 0 .. 
number_of_files - { - let file = central_header_to_zip_file(&mut reader, archive_offset)?; - names_map.insert(file.file_name.clone(), files.len()); - files.push(file); - } - Ok(ZipArchive { reader: reader, - files: files, - names_map: names_map, + number_of_files: number_of_files, + files: Vec::with_capacity(number_of_files), + names_map: HashMap::new(), offset: archive_offset, comment: footer.zip_file_comment, + last_central_header_end: directory_start, }) } @@ -240,7 +230,7 @@ impl ZipArchive /// ``` pub fn len(&self) -> usize { - self.files.len() + self.number_of_files } /// Get the offset from the beginning of the underlying reader that this zip begins at, in bytes. @@ -254,42 +244,44 @@ impl ZipArchive /// Search for a file entry by name pub fn by_name<'a>(&'a mut self, name: &str) -> ZipResult> { - let index = match self.names_map.get(name) { - Some(index) => *index, - None => { return Err(ZipError::FileNotFound); }, - }; - self.by_index(index) + if let Some(&index) = self.names_map.get(name) { + return self.by_index(index); + } + + let data = read_files_till( + &mut self.reader, + self.number_of_files, + &mut self.files, + &mut self.names_map, + self.offset, + &mut self.last_central_header_end, + ReadFilesTillPredicate::FileName(name), + )?; + + get_file_from_data(data, &mut self.reader) } /// Get a contained file by index pub fn by_index<'a>(&'a mut self, file_number: usize) -> ZipResult> { - if file_number >= self.files.len() { return Err(ZipError::FileNotFound); } - let ref mut data = self.files[file_number]; - - if data.encrypted - { - return unsupported_zip_error("Encrypted files are not supported") - } + if file_number >= self.number_of_files { return Err(ZipError::FileNotFound); } - // Parse local header - self.reader.seek(io::SeekFrom::Start(data.header_start))?; - let signature = self.reader.read_u32::()?; - if signature != spec::LOCAL_FILE_HEADER_SIGNATURE - { - return Err(ZipError::InvalidArchive("Invalid local file header")) + let data = if 
file_number < self.files.len() { + &mut self.files[file_number] } + else { + read_files_till( + &mut self.reader, + self.number_of_files, + &mut self.files, + &mut self.names_map, + self.offset, + &mut self.last_central_header_end, + ReadFilesTillPredicate::Number(file_number), + )? + }; - self.reader.seek(io::SeekFrom::Current(22))?; - let file_name_length = self.reader.read_u16::()? as u64; - let extra_field_length = self.reader.read_u16::()? as u64; - let magic_and_header = 4 + 22 + 2 + 2; - data.data_start = data.header_start + magic_and_header + file_name_length + extra_field_length; - - self.reader.seek(io::SeekFrom::Start(data.data_start))?; - let limit_reader = (self.reader.by_ref() as &mut Read).take(data.compressed_size); - - Ok(ZipFile { reader: make_reader(data.compression_method, data.crc32, limit_reader)?, data: Cow::Borrowed(data) }) + get_file_from_data(data, &mut self.reader) } /// Unwrap and return the inner reader object @@ -301,7 +293,7 @@ impl ZipArchive } } -fn central_header_to_zip_file(reader: &mut R, archive_offset: u64) -> ZipResult +fn central_header_to_zip_file(reader: &mut R, archive_offset: u64, central_header_end: &mut u64) -> ZipResult { // Parse central header let signature = reader.read_u32::()?; @@ -331,6 +323,7 @@ fn central_header_to_zip_file(reader: &mut R, archive_offset: let file_name_raw = ReadPodExt::read_exact(reader, file_name_length)?; let extra_field = ReadPodExt::read_exact(reader, extra_field_length)?; let file_comment_raw = ReadPodExt::read_exact(reader, file_comment_length)?; + *central_header_end = reader.seek(io::SeekFrom::Current(0))?; let file_name = match is_utf8 { @@ -423,6 +416,68 @@ fn get_reader<'a>(reader: &'a mut ZipFileReader) -> &'a mut Read { } } +fn get_file_from_data<'a, R>(data: &'a mut ZipFileData, reader: &'a mut R) -> ZipResult> where R: Read + io::Seek { + if data.encrypted + { + return unsupported_zip_error("Encrypted files are not supported") + } + + // Parse local header + 
reader.seek(io::SeekFrom::Start(data.header_start))?; + let signature = reader.read_u32::()?; + if signature != spec::LOCAL_FILE_HEADER_SIGNATURE + { + return Err(ZipError::InvalidArchive("Invalid local file header")) + } + + reader.seek(io::SeekFrom::Current(22))?; + let file_name_length = reader.read_u16::()? as u64; + let extra_field_length = reader.read_u16::()? as u64; + let magic_and_header = 4 + 22 + 2 + 2; + data.data_start = data.header_start + magic_and_header + file_name_length + extra_field_length; + + reader.seek(io::SeekFrom::Start(data.data_start))?; + let limit_reader = (reader as &mut Read).take(data.compressed_size); + + Ok(ZipFile { reader: make_reader(data.compression_method, data.crc32, limit_reader)?, data: Cow::Borrowed(data) }) +} + +fn read_files_till<'a, R>( + mut reader: R, + number_of_files: usize, + files: &'a mut Vec, + names_map: &mut HashMap, + offset: u64, + last_central_header_end: &mut u64, + predicate: ReadFilesTillPredicate, +) -> ZipResult<&'a mut ZipFileData> where R: Read + io::Seek { + for file_number in files.len()..number_of_files { + if let Err(_) = reader.seek(io::SeekFrom::Start(*last_central_header_end)) { + return Err(ZipError::InvalidArchive("Could not seek to start of central file header")); + } + let file = central_header_to_zip_file(&mut reader, offset, last_central_header_end)?; + + let matches = match predicate { + ReadFilesTillPredicate::FileName(file_name) => file.file_name == file_name, + ReadFilesTillPredicate::Number(number) => number == file_number, + }; + + names_map.insert(file.file_name.clone(), file_number); + files.push(file); + + if matches { + return Ok(&mut files[file_number]); + } + } + + return Err(ZipError::FileNotFound); +} + +enum ReadFilesTillPredicate<'a> { + FileName(&'a str), + Number(usize), +} + /// Methods for retrieving information on zip files impl<'a> ZipFile<'a> { fn get_reader(&mut self) -> &mut Read { From 2e5b29e6021849321028ec859ed67519c0453e58 Mon Sep 17 00:00:00 2001 From: 
Arnavion Date: Thu, 22 Nov 2018 23:09:18 -0800 Subject: [PATCH 2/6] Don't need to seek on every iteration. --- src/read.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/read.rs b/src/read.rs index 05c1d8b2c..cef970913 100644 --- a/src/read.rs +++ b/src/read.rs @@ -451,10 +451,11 @@ fn read_files_till<'a, R>( last_central_header_end: &mut u64, predicate: ReadFilesTillPredicate, ) -> ZipResult<&'a mut ZipFileData> where R: Read + io::Seek { + if let Err(_) = reader.seek(io::SeekFrom::Start(*last_central_header_end)) { + return Err(ZipError::InvalidArchive("Could not seek to start of central file header")); + } + for file_number in files.len()..number_of_files { - if let Err(_) = reader.seek(io::SeekFrom::Start(*last_central_header_end)) { - return Err(ZipError::InvalidArchive("Could not seek to start of central file header")); - } let file = central_header_to_zip_file(&mut reader, offset, last_central_header_end)?; let matches = match predicate { From a1eb92cc12a6741bd1d45529318ec93a48ac9b37 Mon Sep 17 00:00:00 2001 From: Arnavion Date: Fri, 23 Nov 2018 02:15:33 -0800 Subject: [PATCH 3/6] Reduce number of seeks. 
--- src/read.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/read.rs b/src/read.rs index cef970913..630836d50 100644 --- a/src/read.rs +++ b/src/read.rs @@ -293,7 +293,7 @@ impl ZipArchive } } -fn central_header_to_zip_file(reader: &mut R, archive_offset: u64, central_header_end: &mut u64) -> ZipResult +fn central_header_to_zip_file(reader: &mut R, archive_offset: u64) -> ZipResult { // Parse central header let signature = reader.read_u32::()?; @@ -323,7 +323,6 @@ fn central_header_to_zip_file(reader: &mut R, archive_offset: let file_name_raw = ReadPodExt::read_exact(reader, file_name_length)?; let extra_field = ReadPodExt::read_exact(reader, extra_field_length)?; let file_comment_raw = ReadPodExt::read_exact(reader, file_comment_length)?; - *central_header_end = reader.seek(io::SeekFrom::Current(0))?; let file_name = match is_utf8 { @@ -456,7 +455,7 @@ fn read_files_till<'a, R>( } for file_number in files.len()..number_of_files { - let file = central_header_to_zip_file(&mut reader, offset, last_central_header_end)?; + let file = central_header_to_zip_file(&mut reader, offset)?; let matches = match predicate { ReadFilesTillPredicate::FileName(file_name) => file.file_name == file_name, @@ -467,10 +466,12 @@ fn read_files_till<'a, R>( names_map.insert(file.file_name.clone(), file_number); files.push(file); if matches { + *last_central_header_end = reader.seek(io::SeekFrom::Current(0))?; return Ok(&mut files[file_number]); } } + *last_central_header_end = reader.seek(io::SeekFrom::Current(0))?; return Err(ZipError::FileNotFound); } From 72ae3e3d56b871a53eadf9ef3ff125fd443b94bb Mon Sep 17 00:00:00 2001 From: Arnavion Date: Tue, 27 Nov 2018 20:49:04 -0800 Subject: [PATCH 4/6] Poison the archive if reading a central file header fails. The reader can be misaligned if it encounters a corrupt file header before it finds the desired file. For example, consider an archive that contains files A, B, C and D, and the user requests file D (say via `by_index(3)`).
Let's say the reader successfully parses the headers of A and B, but fails when parsing C because it's corrupt. In this case, we want the user to receive an error, but we also want to ensure that future calls to `by_index` or `by_name` don't parse A and B's headers again. One way to do this is to preserve the end position of B's header so that future calls to `read_files_till` resume from it. However since we've already parsed B's header and know C's header failed to parse, we can simply consider all headers that haven't already been parsed to be unreachable. This commit marks the archive as poisoned if `read_files_till` fails with any error, so that only headers which have already been successfully parsed will be considered when getting files from the archive. If a file is requested that hasn't been parsed, ie its header lies beyond the known corrupted header, then the function fails immediately. --- src/read.rs | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/read.rs b/src/read.rs index 630836d50..07284fd1e 100644 --- a/src/read.rs +++ b/src/read.rs @@ -8,6 +8,7 @@ use std::io; use std::io::prelude::*; use std::collections::HashMap; use std::borrow::Cow; +use std::mem; use podio::{ReadPodExt, LittleEndian}; use types::{ZipFileData, System, DateTime}; @@ -59,7 +60,7 @@ pub struct ZipArchive names_map: HashMap, offset: u64, comment: Vec, - last_central_header_end: u64, + last_central_header_end: LastCentralHeaderEnd, } enum ZipFileReader<'a> { @@ -71,6 +72,12 @@ enum ZipFileReader<'a> { Bzip2(Crc32Reader>>), } +#[derive(Clone, Copy, Debug)] +enum LastCentralHeaderEnd { + Pos(u64), + Poisoned, +} + /// A struct for reading a zip file pub struct ZipFile<'a> { data: Cow<'a, ZipFileData>, @@ -212,7 +219,7 @@ impl ZipArchive names_map: HashMap::new(), offset: archive_offset, comment: footer.zip_file_comment, - last_central_header_end: directory_start, + last_central_header_end: LastCentralHeaderEnd::Pos(directory_start), 
}) } @@ -447,11 +454,16 @@ fn read_files_till<'a, R>( files: &'a mut Vec, names_map: &mut HashMap, offset: u64, - last_central_header_end: &mut u64, + last_central_header_end: &mut LastCentralHeaderEnd, predicate: ReadFilesTillPredicate, ) -> ZipResult<&'a mut ZipFileData> where R: Read + io::Seek { - if let Err(_) = reader.seek(io::SeekFrom::Start(*last_central_header_end)) { - return Err(ZipError::InvalidArchive("Could not seek to start of central file header")); + let last_central_header_end_pos = match mem::replace(last_central_header_end, LastCentralHeaderEnd::Poisoned) { + LastCentralHeaderEnd::Pos(pos) => pos, + LastCentralHeaderEnd::Poisoned => return Err(ZipError::InvalidArchive("Central file header is corrupt")), + }; + + if let Err(_) = reader.seek(io::SeekFrom::Start(last_central_header_end_pos)) { + return Err(ZipError::InvalidArchive("Could not seek to start of next central file header")); } for file_number in files.len()..number_of_files { @@ -466,12 +478,12 @@ fn read_files_till<'a, R>( files.push(file); if matches { - *last_central_header_end = reader.seek(io::SeekFrom::Current(0))?; + *last_central_header_end = LastCentralHeaderEnd::Pos(reader.seek(io::SeekFrom::Current(0))?); return Ok(&mut files[file_number]); } } - *last_central_header_end = reader.seek(io::SeekFrom::Current(0))?; + *last_central_header_end = LastCentralHeaderEnd::Pos(reader.seek(io::SeekFrom::Current(0))?); return Err(ZipError::FileNotFound); } From 8ad6bd4ecb394c00eb2fa138d68bc34d9a227820 Mon Sep 17 00:00:00 2001 From: Sam Rijs Date: Wed, 5 Dec 2018 21:14:03 +1100 Subject: [PATCH 5/6] zero-copy central directory parsing --- Cargo.toml | 2 + src/buffer.rs | 128 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/cp437.rs | 15 ++++++ src/lib.rs | 3 ++ src/read.rs | 117 ++++++++++++++++++++------------------------- src/types.rs | 18 +++---- src/write.rs | 7 +-- 7 files changed, 214 insertions(+), 76 deletions(-) create mode 100644 src/buffer.rs diff --git a/Cargo.toml 
b/Cargo.toml index 04c5e2909..3dc57ed90 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,8 @@ podio = "0.1" bzip2 = { version = "0.3", optional = true } libflate = { version = "0.1.16", optional = true } crc32fast = "1.0" +bytes = "0.4.11" +string = "0.1.2" [dev-dependencies] bencher = "0.1" diff --git a/src/buffer.rs b/src/buffer.rs new file mode 100644 index 000000000..60a9eb789 --- /dev/null +++ b/src/buffer.rs @@ -0,0 +1,128 @@ +use std::borrow::{Borrow, Cow}; +use std::io::Read; +use std::ops::Deref; +use std::hash::{Hash, Hasher}; + +use bytes::{Buf, Bytes}; +use string; + +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub(crate) struct StrBuf(string::String); + +impl StrBuf { + pub(crate) fn from_str(s: &str) -> Self { + StrBuf(string::String::from_str(s)) + } + + pub(crate) fn from_utf8(bytes: ByteBuf) -> Result { + Ok(StrBuf(string::TryFrom::try_from(bytes.0)?)) + } + + pub(crate) fn from_utf8_lossy(bytes: ByteBuf) -> Self { + match String::from_utf8_lossy(bytes.as_ref()) { + Cow::Owned(s) => s.into(), + Cow::Borrowed(s) => { + // SAFETY: We know that `bytes` only contains valid utf-8, + // since the `from_utf8_lossy` operation returned the + // input verbatim. + debug_assert_eq!(s.len(), bytes.len()); + StrBuf(unsafe { string::String::from_utf8_unchecked(bytes.clone().0) }) + } + } + } +} + +impl From for StrBuf { + fn from(s: String) -> Self { + let bytes = s.into_bytes().into(); + // SAFETY: We know that `bytes` only contains valid utf-8, + // since the underlying data comes from the input string. + StrBuf(unsafe { string::String::from_utf8_unchecked(bytes) }) + } +} + +impl Hash for StrBuf { + fn hash(&self, state: &mut H) { + // Because of the impl Borrow for StrBuf, we need to make sure that the Hash + // implementations behave identically between str and StrBuf. 
+ // + // Quoting the documentation for the Borrow trait: + // + // > Further, when providing implementations for additional traits, it needs to be + // > considered whether they should behave identical to those of the underlying type as a + // > consequence of acting as a representation of that underlying type. + // > Generic code typically uses Borrow when it relies on the identical behavior of + // > these additional trait implementations. + // > These traits will likely appear as additional trait bounds. + // + // Without this, it would be impossible to look up an entry from the names_map by &str, + // since the str and StrBuf would evaluate to different hashes, even if they represent the + // same sequence of characters. + str::hash(&*self, state) + } +} + +impl Borrow for StrBuf { + fn borrow(&self) -> &str { + self.0.borrow() + } +} + +impl Deref for StrBuf { + type Target = str; + + fn deref(&self) -> &str { + &self.0 + } +} + +#[derive(Clone, Debug)] +pub(crate) struct ByteBuf(Bytes); + +impl ByteBuf { + pub(crate) fn len(&self) -> usize { + self.0.len() + } + + pub(crate) fn split_to(&mut self, at: usize) -> ByteBuf { + ByteBuf(self.0.split_to(at)) + } +} + +impl Buf for ByteBuf { + fn remaining(&self) -> usize { + self.0.len() + } + + fn bytes(&self) -> &[u8] { + self.0.as_ref() + } + + fn advance(&mut self, cnt: usize) { + self.0.advance(cnt) + } +} + +impl AsRef<[u8]> for ByteBuf { + fn as_ref(&self) -> &[u8] { + self.0.as_ref() + } +} + +impl Read for ByteBuf { + fn read(&mut self, buf: &mut [u8]) -> ::std::io::Result { + self.reader().read(buf) + } +} + +impl From> for ByteBuf { + fn from(vec: Vec) -> Self { + ByteBuf(vec.into()) + } +} + +impl From for ByteBuf { + fn from(bytes: Bytes) -> Self { + ByteBuf(bytes) + } +} diff --git a/src/cp437.rs b/src/cp437.rs index 202dc8fbb..d041dbd3a 100644 --- a/src/cp437.rs +++ b/src/cp437.rs @@ -1,5 +1,7 @@ //! 
Convert a string in IBM codepage 437 to UTF-8 +use buffer::{StrBuf, ByteBuf}; + /// Trait to convert IBM codepage 437 to the target type pub trait FromCp437 { /// Target type @@ -37,6 +39,19 @@ impl FromCp437 for Vec { } } +impl FromCp437 for ByteBuf { + type Target = StrBuf; + + fn from_cp437(self) -> Self::Target { + if self.as_ref().iter().all(|c| *c < 0x80) { + StrBuf::from_utf8(self).unwrap() + } + else { + self.as_ref().into_iter().map(|c| to_char(*c)).collect::().into() + } + } +} + fn to_char(input: u8) -> char { let output = match input diff --git a/src/lib.rs b/src/lib.rs index ef2fd0494..ab99509e2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,12 +2,14 @@ #![warn(missing_docs)] +extern crate bytes; #[cfg(feature = "bzip2")] extern crate bzip2; extern crate crc32fast; #[cfg(feature = "deflate")] extern crate libflate; extern crate podio; +extern crate string; #[cfg(feature = "time")] extern crate time; @@ -16,6 +18,7 @@ pub use write::ZipWriter; pub use compression::CompressionMethod; pub use types::DateTime; +mod buffer; mod spec; mod crc32; mod types; diff --git a/src/read.rs b/src/read.rs index 07284fd1e..4c7e57813 100644 --- a/src/read.rs +++ b/src/read.rs @@ -1,5 +1,6 @@ //! 
Structs for reading a ZIP archive +use bytes::Buf; use crc32::Crc32Reader; use compression::CompressionMethod; use spec; @@ -8,8 +9,8 @@ use std::io; use std::io::prelude::*; use std::collections::HashMap; use std::borrow::Cow; -use std::mem; +use buffer::{ByteBuf, StrBuf}; use podio::{ReadPodExt, LittleEndian}; use types::{ZipFileData, System, DateTime}; use cp437::FromCp437; @@ -57,10 +58,10 @@ pub struct ZipArchive reader: R, number_of_files: usize, files: Vec, - names_map: HashMap, + names_map: HashMap, offset: u64, comment: Vec, - last_central_header_end: LastCentralHeaderEnd, + central_directory_bytes: ByteBuf, } enum ZipFileReader<'a> { @@ -72,12 +73,6 @@ enum ZipFileReader<'a> { Bzip2(Crc32Reader>>), } -#[derive(Clone, Copy, Debug)] -enum LastCentralHeaderEnd { - Pos(u64), - Poisoned, -} - /// A struct for reading a zip file pub struct ZipFile<'a> { data: Cow<'a, ZipFileData>, @@ -129,7 +124,7 @@ impl ZipArchive /// separate function to ease the control flow design. fn get_directory_counts(reader: &mut R, footer: &spec::CentralDirectoryEnd, - cde_start_pos: u64) -> ZipResult<(u64, u64, usize)> { + cde_start_pos: u64) -> ZipResult<(u64, u64, u64, usize)> { // See if there's a ZIP64 footer. The ZIP64 locator if present will // have its signature 20 bytes in front of the standard footer. The // standard footer, in turn, is 22+N bytes large, where N is the @@ -164,8 +159,9 @@ impl ZipArchive .ok_or(ZipError::InvalidArchive("Invalid central directory size or offset"))?; let directory_start = footer.central_directory_offset as u64 + archive_offset; + let directory_size = footer.central_directory_size as u64; let number_of_files = footer.number_of_files_on_this_disk as usize; - return Ok((archive_offset, directory_start, number_of_files)); + return Ok((archive_offset, directory_start, directory_size, number_of_files)); }, Some(locator64) => { // If we got here, this is indeed a ZIP64 file. 
@@ -195,7 +191,8 @@ impl ZipArchive } let directory_start = footer.central_directory_offset + archive_offset; - Ok((archive_offset, directory_start, footer.number_of_files as usize)) + let directory_size = footer.central_directory_size; + Ok((archive_offset, directory_start, directory_size, footer.number_of_files as usize)) }, } } @@ -209,17 +206,21 @@ impl ZipArchive return unsupported_zip_error("Support for multi-disk files is not implemented") } - let (archive_offset, directory_start, number_of_files) = + let (archive_offset, directory_start, directory_size, number_of_files) = Self::get_directory_counts(&mut reader, &footer, cde_start_pos)?; + let mut central_directory_bytes = vec![0; directory_size as usize]; + reader.seek(io::SeekFrom::Start(directory_start))?; + reader.read_exact(&mut central_directory_bytes)?; + Ok(ZipArchive { reader: reader, number_of_files: number_of_files, files: Vec::with_capacity(number_of_files), - names_map: HashMap::new(), + names_map: HashMap::with_capacity(number_of_files), offset: archive_offset, comment: footer.zip_file_comment, - last_central_header_end: LastCentralHeaderEnd::Pos(directory_start), + central_directory_bytes: central_directory_bytes.into(), }) } @@ -256,12 +257,11 @@ impl ZipArchive } let data = read_files_till( - &mut self.reader, self.number_of_files, &mut self.files, &mut self.names_map, self.offset, - &mut self.last_central_header_end, + &mut self.central_directory_bytes, ReadFilesTillPredicate::FileName(name), )?; @@ -278,12 +278,11 @@ impl ZipArchive } else { read_files_till( - &mut self.reader, self.number_of_files, &mut self.files, &mut self.names_map, self.offset, - &mut self.last_central_header_end, + &mut self.central_directory_bytes, ReadFilesTillPredicate::Number(file_number), )? 
}; @@ -300,45 +299,45 @@ impl ZipArchive } } -fn central_header_to_zip_file(reader: &mut R, archive_offset: u64) -> ZipResult +fn central_header_to_zip_file(bytes: &mut ByteBuf, archive_offset: u64) -> ZipResult { // Parse central header - let signature = reader.read_u32::()?; + let signature = bytes.read_u32::()?; if signature != spec::CENTRAL_DIRECTORY_HEADER_SIGNATURE { return Err(ZipError::InvalidArchive("Invalid Central Directory header")) } - let version_made_by = reader.read_u16::()?; - let _version_to_extract = reader.read_u16::()?; - let flags = reader.read_u16::()?; + let version_made_by = bytes.get_u16_le(); + let _version_to_extract = bytes.get_u16_le(); + let flags = bytes.get_u16_le(); let encrypted = flags & 1 == 1; let is_utf8 = flags & (1 << 11) != 0; - let compression_method = reader.read_u16::()?; - let last_mod_time = reader.read_u16::()?; - let last_mod_date = reader.read_u16::()?; - let crc32 = reader.read_u32::()?; - let compressed_size = reader.read_u32::()?; - let uncompressed_size = reader.read_u32::()?; - let file_name_length = reader.read_u16::()? as usize; - let extra_field_length = reader.read_u16::()? as usize; - let file_comment_length = reader.read_u16::()? as usize; - let _disk_number = reader.read_u16::()?; - let _internal_file_attributes = reader.read_u16::()?; - let external_file_attributes = reader.read_u32::()?; - let offset = reader.read_u32::()? 
as u64; - let file_name_raw = ReadPodExt::read_exact(reader, file_name_length)?; - let extra_field = ReadPodExt::read_exact(reader, extra_field_length)?; - let file_comment_raw = ReadPodExt::read_exact(reader, file_comment_length)?; + let compression_method = bytes.get_u16_le(); + let last_mod_time = bytes.get_u16_le(); + let last_mod_date = bytes.get_u16_le(); + let crc32 = bytes.get_u32_le(); + let compressed_size = bytes.get_u32_le(); + let uncompressed_size = bytes.get_u32_le(); + let file_name_length = bytes.get_u16_le() as usize; + let extra_field_length = bytes.get_u16_le() as usize; + let file_comment_length = bytes.get_u16_le() as usize; + let _disk_number = bytes.get_u16_le(); + let _internal_file_attributes = bytes.get_u16_le(); + let external_file_attributes = bytes.get_u32_le(); + let offset = bytes.get_u32_le() as u64; + let file_name_raw = bytes.split_to(file_name_length); + let extra_field = bytes.split_to(extra_field_length); + let file_comment_raw = bytes.split_to(file_comment_length); let file_name = match is_utf8 { - true => String::from_utf8_lossy(&*file_name_raw).into_owned(), + true => StrBuf::from_utf8_lossy(file_name_raw.clone()), false => file_name_raw.clone().from_cp437(), }; let file_comment = match is_utf8 { - true => String::from_utf8_lossy(&*file_comment_raw).into_owned(), + true => StrBuf::from_utf8_lossy(file_comment_raw), false => file_comment_raw.from_cp437(), }; @@ -361,7 +360,7 @@ fn central_header_to_zip_file(reader: &mut R, archive_offset: external_attributes: external_file_attributes, }; - match parse_extra_field(&mut result, &*extra_field) { + match parse_extra_field(&mut result, extra_field.as_ref()) { Ok(..) 
| Err(ZipError::Io(..)) => {}, Err(e) => Err(e)?, } @@ -448,29 +447,19 @@ fn get_file_from_data<'a, R>(data: &'a mut ZipFileData, reader: &'a mut R) -> Zi Ok(ZipFile { reader: make_reader(data.compression_method, data.crc32, limit_reader)?, data: Cow::Borrowed(data) }) } -fn read_files_till<'a, R>( - mut reader: R, +fn read_files_till<'a>( number_of_files: usize, files: &'a mut Vec, - names_map: &mut HashMap, + names_map: &mut HashMap, offset: u64, - last_central_header_end: &mut LastCentralHeaderEnd, + central_directory_bytes: &mut ByteBuf, predicate: ReadFilesTillPredicate, -) -> ZipResult<&'a mut ZipFileData> where R: Read + io::Seek { - let last_central_header_end_pos = match mem::replace(last_central_header_end, LastCentralHeaderEnd::Poisoned) { - LastCentralHeaderEnd::Pos(pos) => pos, - LastCentralHeaderEnd::Poisoned => return Err(ZipError::InvalidArchive("Central file header is corrupt")), - }; - - if let Err(_) = reader.seek(io::SeekFrom::Start(last_central_header_end_pos)) { - return Err(ZipError::InvalidArchive("Could not seek to start of next central file header")); - } - +) -> ZipResult<&'a mut ZipFileData> { for file_number in files.len()..number_of_files { - let file = central_header_to_zip_file(&mut reader, offset)?; + let file = central_header_to_zip_file(central_directory_bytes, offset)?; let matches = match predicate { - ReadFilesTillPredicate::FileName(file_name) => file.file_name == file_name, + ReadFilesTillPredicate::FileName(file_name) => &*file.file_name == file_name, ReadFilesTillPredicate::Number(number) => number == file_number, }; @@ -478,12 +467,10 @@ fn read_files_till<'a, R>( files.push(file); if matches { - *last_central_header_end = LastCentralHeaderEnd::Pos(reader.seek(io::SeekFrom::Current(0))?); return Ok(&mut files[file_number]); } } - *last_central_header_end = LastCentralHeaderEnd::Pos(reader.seek(io::SeekFrom::Current(0))?); return Err(ZipError::FileNotFound); } @@ -507,7 +494,7 @@ impl<'a> ZipFile<'a> { } /// Get the name of 
the file, in the raw (internal) byte representation. pub fn name_raw(&self) -> &[u8] { - &*self.data.file_name_raw + self.data.file_name_raw.as_ref() } /// Get the name of the file in a sanitized form. It truncates the name to the first NULL byte, /// removes a leading '/' and removes '..' parts. @@ -664,9 +651,9 @@ pub fn read_zipfile_from_stream<'a, R: io::Read>(reader: &'a mut R) -> ZipResult crc32: crc32, compressed_size: compressed_size as u64, uncompressed_size: uncompressed_size as u64, - file_name: file_name, - file_name_raw: file_name_raw, - file_comment: String::new(), // file comment is only available in the central directory + file_name: file_name.into(), + file_name_raw: file_name_raw.into(), + file_comment: StrBuf::from_str(""), // file comment is only available in the central directory // header_start and data start are not available, but also don't matter, since seeking is // not available. header_start: 0, diff --git a/src/types.rs b/src/types.rs index 2abe3a36f..62de092f0 100644 --- a/src/types.rs +++ b/src/types.rs @@ -1,5 +1,7 @@ //! Types that specify what is contained in a ZIP. +use buffer::{ByteBuf, StrBuf}; + #[derive(Clone, Copy, Debug, PartialEq)] pub enum System { @@ -193,7 +195,7 @@ pub const DEFAULT_VERSION: u8 = 46; /// Structure representing a ZIP file. #[derive(Debug, Clone)] -pub struct ZipFileData +pub(crate) struct ZipFileData { /// Compatibility of the file attribute information pub system: System, @@ -212,11 +214,11 @@ pub struct ZipFileData /// Size of the file when extracted pub uncompressed_size: u64, /// Name of the file - pub file_name: String, + pub file_name: StrBuf, /// Raw file name. To be used when file_name was incorrectly decoded. 
- pub file_name_raw: Vec, + pub file_name_raw: ByteBuf, /// File comment - pub file_comment: String, + pub file_comment: StrBuf, /// Specifies where the local header of the file starts pub header_start: u64, /// Specifies where the compressed data of the file starts @@ -278,7 +280,7 @@ mod test { #[test] fn sanitize() { use super::*; - let file_name = "/path/../../../../etc/./passwd\0/etc/shadow".to_string(); + let file_name = "/path/../../../../etc/./passwd\0/etc/shadow"; let data = ZipFileData { system: System::Dos, version_made_by: 0, @@ -288,9 +290,9 @@ mod test { crc32: 0, compressed_size: 0, uncompressed_size: 0, - file_name: file_name.clone(), - file_name_raw: file_name.into_bytes(), - file_comment: String::new(), + file_name: StrBuf::from_str(file_name), + file_name_raw: ByteBuf::from(file_name.as_bytes().to_vec()), + file_comment: StrBuf::from_str(""), header_start: 0, data_start: 0, external_attributes: 0, diff --git a/src/write.rs b/src/write.rs index 4dec56f87..e9483d07e 100644 --- a/src/write.rs +++ b/src/write.rs @@ -1,5 +1,6 @@ //! 
Structs for creating a new zip archive +use buffer::StrBuf; use compression::CompressionMethod; use types::{ZipFileData, System, DEFAULT_VERSION, DateTime}; use spec; @@ -198,9 +199,9 @@ impl ZipWriter crc32: 0, compressed_size: 0, uncompressed_size: 0, - file_name: file_name, - file_name_raw: file_name_raw, - file_comment: String::new(), + file_name: file_name.into(), + file_name_raw: file_name_raw.into(), + file_comment: StrBuf::from_str(""), header_start: header_start, data_start: 0, external_attributes: permissions << 16, From 5d5491d0e9d1e7578c83043f058ca2dcc1a6c27f Mon Sep 17 00:00:00 2001 From: Sam Rijs Date: Sat, 15 Dec 2018 16:49:15 +1100 Subject: [PATCH 6/6] improve bounds checking for central directory header parsing --- src/read.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/read.rs b/src/read.rs index 4c7e57813..2ce967e39 100644 --- a/src/read.rs +++ b/src/read.rs @@ -308,6 +308,11 @@ fn central_header_to_zip_file(bytes: &mut ByteBuf, archive_offset: u64) -> ZipRe return Err(ZipError::InvalidArchive("Invalid Central Directory header")) } + // ensure that the header is big enough to parse the following fixed-length sections + if bytes.remaining() < 42 { + return Err(ZipError::Io(io::ErrorKind::UnexpectedEof.into())); + } + let version_made_by = bytes.get_u16_le(); let _version_to_extract = bytes.get_u16_le(); let flags = bytes.get_u16_le(); @@ -326,6 +331,12 @@ fn central_header_to_zip_file(bytes: &mut ByteBuf, archive_offset: u64) -> ZipRe let _internal_file_attributes = bytes.get_u16_le(); let external_file_attributes = bytes.get_u32_le(); let offset = bytes.get_u32_le() as u64; + + // ensure that the header is big enough to parse the following variable-length sections + if bytes.remaining() < file_name_length + extra_field_length + file_comment_length { + return Err(ZipError::Io(io::ErrorKind::UnexpectedEof.into())); + } + let file_name_raw = bytes.split_to(file_name_length); let extra_field = 
bytes.split_to(extra_field_length); let file_comment_raw = bytes.split_to(file_comment_length);