diff --git a/Cargo.toml b/Cargo.toml
index 04c5e2909..3dc57ed90 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,8 @@ podio = "0.1"
 bzip2 = { version = "0.3", optional = true }
 libflate = { version = "0.1.16", optional = true }
 crc32fast = "1.0"
+bytes = "0.4.11"
+string = "0.1.2"
 
 [dev-dependencies]
 bencher = "0.1"
diff --git a/src/buffer.rs b/src/buffer.rs
new file mode 100644
index 000000000..60a9eb789
--- /dev/null
+++ b/src/buffer.rs
@@ -0,0 +1,128 @@
+use std::borrow::{Borrow, Cow};
+use std::io::Read;
+use std::ops::Deref;
+use std::hash::{Hash, Hasher};
+
+use bytes::{Buf, Bytes};
+use string;
+
+#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub(crate) struct StrBuf(string::String<Bytes>);
+
+impl StrBuf {
+    pub(crate) fn from_str(s: &str) -> Self {
+        StrBuf(string::String::from_str(s))
+    }
+
+    pub(crate) fn from_utf8(bytes: ByteBuf) -> Result<Self, ::std::str::Utf8Error> {
+        Ok(StrBuf(string::TryFrom::try_from(bytes.0)?))
+    }
+
+    pub(crate) fn from_utf8_lossy(bytes: ByteBuf) -> Self {
+        match String::from_utf8_lossy(bytes.as_ref()) {
+            Cow::Owned(s) => s.into(),
+            Cow::Borrowed(s) => {
+                // SAFETY: We know that `bytes` only contains valid utf-8,
+                // since the `from_utf8_lossy` operation returned the
+                // input verbatim.
+                debug_assert_eq!(s.len(), bytes.len());
+                StrBuf(unsafe { string::String::from_utf8_unchecked(bytes.clone().0) })
+            }
+        }
+    }
+}
+
+impl From<String> for StrBuf {
+    fn from(s: String) -> Self {
+        let bytes = s.into_bytes().into();
+        // SAFETY: We know that `bytes` only contains valid utf-8,
+        // since the underlying data comes from the input string.
+        StrBuf(unsafe { string::String::from_utf8_unchecked(bytes) })
+    }
+}
+
+impl Hash for StrBuf {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        // Because of the impl Borrow<str> for StrBuf, we need to make sure that the Hash
+        // implementations behave identically between str and StrBuf.
+        //
+        // Quoting the documentation for the Borrow trait:
+        //
+        // > Further, when providing implementations for additional traits, it needs to be
+        // > considered whether they should behave identical to those of the underlying type as a
+        // > consequence of acting as a representation of that underlying type.
+        // > Generic code typically uses Borrow<T> when it relies on the identical behavior of
+        // > these additional trait implementations.
+        // > These traits will likely appear as additional trait bounds.
+        //
+        // Without this, it would be impossible to look up an entry from the names_map by &str,
+        // since the str and StrBuf would evaluate to different hashes, even if they represent the
+        // same sequence of characters.
+        str::hash(&*self, state)
+    }
+}
+
+impl Borrow<str> for StrBuf {
+    fn borrow(&self) -> &str {
+        self.0.borrow()
+    }
+}
+
+impl Deref for StrBuf {
+    type Target = str;
+
+    fn deref(&self) -> &str {
+        &self.0
+    }
+}
+
+#[derive(Clone, Debug)]
+pub(crate) struct ByteBuf(Bytes);
+
+impl ByteBuf {
+    pub(crate) fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    pub(crate) fn split_to(&mut self, at: usize) -> ByteBuf {
+        ByteBuf(self.0.split_to(at))
+    }
+}
+
+impl Buf for ByteBuf {
+    fn remaining(&self) -> usize {
+        self.0.len()
+    }
+
+    fn bytes(&self) -> &[u8] {
+        self.0.as_ref()
+    }
+
+    fn advance(&mut self, cnt: usize) {
+        self.0.advance(cnt)
+    }
+}
+
+impl AsRef<[u8]> for ByteBuf {
+    fn as_ref(&self) -> &[u8] {
+        self.0.as_ref()
+    }
+}
+
+impl Read for ByteBuf {
+    fn read(&mut self, buf: &mut [u8]) -> ::std::io::Result<usize> {
+        self.reader().read(buf)
+    }
+}
+
+impl From<Vec<u8>> for ByteBuf {
+    fn from(vec: Vec<u8>) -> Self {
+        ByteBuf(vec.into())
+    }
+}
+
+impl From<Bytes> for ByteBuf {
+    fn from(bytes: Bytes) -> Self {
+        ByteBuf(bytes)
+    }
+}
diff --git a/src/cp437.rs b/src/cp437.rs
index 202dc8fbb..d041dbd3a 100644
--- a/src/cp437.rs
+++ b/src/cp437.rs
@@ -1,5 +1,7 @@
 //! Convert a string in IBM codepage 437 to UTF-8
 
+use buffer::{StrBuf, ByteBuf};
+
 /// Trait to convert IBM codepage 437 to the target type
 pub trait FromCp437 {
     /// Target type
@@ -37,6 +39,19 @@ impl FromCp437 for Vec<u8> {
     }
 }
 
+impl FromCp437 for ByteBuf {
+    type Target = StrBuf;
+
+    fn from_cp437(self) -> Self::Target {
+        if self.as_ref().iter().all(|c| *c < 0x80) {
+            StrBuf::from_utf8(self).unwrap()
+        }
+        else {
+            self.as_ref().into_iter().map(|c| to_char(*c)).collect::<String>().into()
+        }
+    }
+}
+
 fn to_char(input: u8) -> char
 {
     let output = match input
diff --git a/src/lib.rs b/src/lib.rs
index ef2fd0494..ab99509e2 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,12 +2,14 @@
 
 #![warn(missing_docs)]
 
+extern crate bytes;
 #[cfg(feature = "bzip2")]
 extern crate bzip2;
 extern crate crc32fast;
 #[cfg(feature = "deflate")]
 extern crate libflate;
 extern crate podio;
+extern crate string;
 #[cfg(feature = "time")]
 extern crate time;
 
@@ -16,6 +18,7 @@ pub use write::ZipWriter;
 pub use compression::CompressionMethod;
 pub use types::DateTime;
 
+mod buffer;
 mod spec;
 mod crc32;
 mod types;
diff --git a/src/read.rs b/src/read.rs
index ba20214c8..2ce967e39 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -1,5 +1,6 @@
 //! Structs for reading a ZIP archive
 
+use bytes::Buf;
 use crc32::Crc32Reader;
 use compression::CompressionMethod;
 use spec;
@@ -9,6 +10,7 @@ use std::io::prelude::*;
 use std::collections::HashMap;
 use std::borrow::Cow;
 
+use buffer::{ByteBuf, StrBuf};
 use podio::{ReadPodExt, LittleEndian};
 use types::{ZipFileData, System, DateTime};
 use cp437::FromCp437;
@@ -54,10 +56,12 @@ mod ffi {
 pub struct ZipArchive<R: Read + io::Seek>
 {
     reader: R,
+    number_of_files: usize,
     files: Vec<ZipFileData>,
-    names_map: HashMap<String, usize>,
+    names_map: HashMap<StrBuf, usize>,
     offset: u64,
     comment: Vec<u8>,
+    central_directory_bytes: ByteBuf,
 }
 
 enum ZipFileReader<'a> {
@@ -120,7 +124,7 @@ impl<R: Read+io::Seek> ZipArchive<R>
     /// separate function to ease the control flow design.
     fn get_directory_counts(reader: &mut R,
                             footer: &spec::CentralDirectoryEnd,
-                            cde_start_pos: u64) -> ZipResult<(u64, u64, usize)> {
+                            cde_start_pos: u64) -> ZipResult<(u64, u64, u64, usize)> {
         // See if there's a ZIP64 footer. The ZIP64 locator if present will
         // have its signature 20 bytes in front of the standard footer. The
         // standard footer, in turn, is 22+N bytes large, where N is the
@@ -155,8 +159,9 @@ impl<R: Read+io::Seek> ZipArchive<R>
                     .ok_or(ZipError::InvalidArchive("Invalid central directory size or offset"))?;
 
                 let directory_start = footer.central_directory_offset as u64 + archive_offset;
+                let directory_size = footer.central_directory_size as u64;
                 let number_of_files = footer.number_of_files_on_this_disk as usize;
-                return Ok((archive_offset, directory_start, number_of_files));
+                return Ok((archive_offset, directory_start, directory_size, number_of_files));
             },
             Some(locator64) => {
                 // If we got here, this is indeed a ZIP64 file.
@@ -186,7 +191,8 @@ impl<R: Read+io::Seek> ZipArchive<R>
                 }
 
                 let directory_start = footer.central_directory_offset + archive_offset;
-                Ok((archive_offset, directory_start, footer.number_of_files as usize))
+                let directory_size = footer.central_directory_size;
+                Ok((archive_offset, directory_start, directory_size, footer.number_of_files as usize))
             },
         }
     }
@@ -200,29 +206,21 @@ impl<R: Read+io::Seek> ZipArchive<R>
             return unsupported_zip_error("Support for multi-disk files is not implemented")
         }
 
-        let (archive_offset, directory_start, number_of_files) =
+        let (archive_offset, directory_start, directory_size, number_of_files) =
             Self::get_directory_counts(&mut reader, &footer, cde_start_pos)?;
 
-        let mut files = Vec::new();
-        let mut names_map = HashMap::new();
-
-        if let Err(_) = reader.seek(io::SeekFrom::Start(directory_start)) {
-            return Err(ZipError::InvalidArchive("Could not seek to start of central directory"));
-        }
-
-        for _ in 0 .. number_of_files
-        {
-            let file = central_header_to_zip_file(&mut reader, archive_offset)?;
-            names_map.insert(file.file_name.clone(), files.len());
-            files.push(file);
-        }
+        let mut central_directory_bytes = vec![0; directory_size as usize];
+        reader.seek(io::SeekFrom::Start(directory_start))?;
+        reader.read_exact(&mut central_directory_bytes)?;
 
         Ok(ZipArchive {
             reader: reader,
-            files: files,
-            names_map: names_map,
+            number_of_files: number_of_files,
+            files: Vec::with_capacity(number_of_files),
+            names_map: HashMap::with_capacity(number_of_files),
             offset: archive_offset,
             comment: footer.zip_file_comment,
+            central_directory_bytes: central_directory_bytes.into(),
         })
     }
 
@@ -240,7 +238,7 @@ impl<R: Read+io::Seek> ZipArchive<R>
     /// ```
     pub fn len(&self) -> usize
     {
-        self.files.len()
+        self.number_of_files
     }
 
     /// Get the offset from the beginning of the underlying reader that this zip begins at, in bytes.
@@ -254,42 +252,42 @@ impl<R: Read+io::Seek> ZipArchive<R>
     /// Search for a file entry by name
     pub fn by_name<'a>(&'a mut self, name: &str) -> ZipResult<ZipFile<'a>>
     {
-        let index = match self.names_map.get(name) {
-            Some(index) => *index,
-            None => { return Err(ZipError::FileNotFound); },
-        };
-        self.by_index(index)
+        if let Some(&index) = self.names_map.get(name) {
+            return self.by_index(index);
+        }
+
+        let data = read_files_till(
+            self.number_of_files,
+            &mut self.files,
+            &mut self.names_map,
+            self.offset,
+            &mut self.central_directory_bytes,
+            ReadFilesTillPredicate::FileName(name),
+        )?;
+
+        get_file_from_data(data, &mut self.reader)
     }
 
     /// Get a contained file by index
     pub fn by_index<'a>(&'a mut self, file_number: usize) -> ZipResult<ZipFile<'a>>
     {
-        if file_number >= self.files.len() { return Err(ZipError::FileNotFound); }
-        let ref mut data = self.files[file_number];
+        if file_number >= self.number_of_files { return Err(ZipError::FileNotFound); }
 
-        if data.encrypted
-        {
-            return unsupported_zip_error("Encrypted files are not supported")
+        let data = if file_number < self.files.len() {
+            &mut self.files[file_number]
         }
+        else {
+            read_files_till(
+                self.number_of_files,
+                &mut self.files,
+                &mut self.names_map,
+                self.offset,
+                &mut self.central_directory_bytes,
+                ReadFilesTillPredicate::Number(file_number),
+            )?
+        };
 
-        // Parse local header
-        self.reader.seek(io::SeekFrom::Start(data.header_start))?;
-        let signature = self.reader.read_u32::<LittleEndian>()?;
-        if signature != spec::LOCAL_FILE_HEADER_SIGNATURE
-        {
-            return Err(ZipError::InvalidArchive("Invalid local file header"))
-        }
-
-        self.reader.seek(io::SeekFrom::Current(22))?;
-        let file_name_length = self.reader.read_u16::<LittleEndian>()? as u64;
-        let extra_field_length = self.reader.read_u16::<LittleEndian>()? as u64;
-        let magic_and_header = 4 + 22 + 2 + 2;
-        data.data_start = data.header_start + magic_and_header + file_name_length + extra_field_length;
-
-        self.reader.seek(io::SeekFrom::Start(data.data_start))?;
-        let limit_reader = (self.reader.by_ref() as &mut Read).take(data.compressed_size);
-
-        Ok(ZipFile { reader: make_reader(data.compression_method, data.crc32, limit_reader)?, data: Cow::Borrowed(data) })
+        get_file_from_data(data, &mut self.reader)
     }
 
     /// Unwrap and return the inner reader object
@@ -301,45 +299,56 @@ impl<R: Read+io::Seek> ZipArchive<R>
     }
 }
 
-fn central_header_to_zip_file<R: Read+io::Seek>(reader: &mut R, archive_offset: u64) -> ZipResult<ZipFileData>
+fn central_header_to_zip_file(bytes: &mut ByteBuf, archive_offset: u64) -> ZipResult<ZipFileData>
 {
     // Parse central header
-    let signature = reader.read_u32::<LittleEndian>()?;
+    let signature = bytes.read_u32::<LittleEndian>()?;
     if signature != spec::CENTRAL_DIRECTORY_HEADER_SIGNATURE
     {
         return Err(ZipError::InvalidArchive("Invalid Central Directory header"))
     }
 
-    let version_made_by = reader.read_u16::<LittleEndian>()?;
-    let _version_to_extract = reader.read_u16::<LittleEndian>()?;
-    let flags = reader.read_u16::<LittleEndian>()?;
+    // ensure that the header is big enough to parse the following fixed-length sections
+    if bytes.remaining() < 42 {
+        return Err(ZipError::Io(io::ErrorKind::UnexpectedEof.into()));
+    }
+
+    let version_made_by = bytes.get_u16_le();
+    let _version_to_extract = bytes.get_u16_le();
+    let flags = bytes.get_u16_le();
     let encrypted = flags & 1 == 1;
     let is_utf8 = flags & (1 << 11) != 0;
-    let compression_method = reader.read_u16::<LittleEndian>()?;
-    let last_mod_time = reader.read_u16::<LittleEndian>()?;
-    let last_mod_date = reader.read_u16::<LittleEndian>()?;
-    let crc32 = reader.read_u32::<LittleEndian>()?;
-    let compressed_size = reader.read_u32::<LittleEndian>()?;
-    let uncompressed_size = reader.read_u32::<LittleEndian>()?;
-    let file_name_length = reader.read_u16::<LittleEndian>()? as usize;
-    let extra_field_length = reader.read_u16::<LittleEndian>()? as usize;
-    let file_comment_length = reader.read_u16::<LittleEndian>()? as usize;
-    let _disk_number = reader.read_u16::<LittleEndian>()?;
-    let _internal_file_attributes = reader.read_u16::<LittleEndian>()?;
-    let external_file_attributes = reader.read_u32::<LittleEndian>()?;
-    let offset = reader.read_u32::<LittleEndian>()? as u64;
-    let file_name_raw = ReadPodExt::read_exact(reader, file_name_length)?;
-    let extra_field = ReadPodExt::read_exact(reader, extra_field_length)?;
-    let file_comment_raw = ReadPodExt::read_exact(reader, file_comment_length)?;
+    let compression_method = bytes.get_u16_le();
+    let last_mod_time = bytes.get_u16_le();
+    let last_mod_date = bytes.get_u16_le();
+    let crc32 = bytes.get_u32_le();
+    let compressed_size = bytes.get_u32_le();
+    let uncompressed_size = bytes.get_u32_le();
+    let file_name_length = bytes.get_u16_le() as usize;
+    let extra_field_length = bytes.get_u16_le() as usize;
+    let file_comment_length = bytes.get_u16_le() as usize;
+    let _disk_number = bytes.get_u16_le();
+    let _internal_file_attributes = bytes.get_u16_le();
+    let external_file_attributes = bytes.get_u32_le();
+    let offset = bytes.get_u32_le() as u64;
+
+    // ensure that the header is big enough to parse the following variable-length sections
+    if bytes.remaining() < file_name_length + extra_field_length + file_comment_length {
+        return Err(ZipError::Io(io::ErrorKind::UnexpectedEof.into()));
+    }
+
+    let file_name_raw = bytes.split_to(file_name_length);
+    let extra_field = bytes.split_to(extra_field_length);
+    let file_comment_raw = bytes.split_to(file_comment_length);
 
     let file_name = match is_utf8
     {
-        true => String::from_utf8_lossy(&*file_name_raw).into_owned(),
+        true => StrBuf::from_utf8_lossy(file_name_raw.clone()),
         false => file_name_raw.clone().from_cp437(),
     };
     let file_comment = match is_utf8
     {
-        true => String::from_utf8_lossy(&*file_comment_raw).into_owned(),
+        true => StrBuf::from_utf8_lossy(file_comment_raw),
         false => file_comment_raw.from_cp437(),
     };
 
@@ -362,7 +371,7 @@ fn central_header_to_zip_file<R: Read+io::Seek>(reader: &mut R, archive_offset:
         external_attributes: external_file_attributes,
     };
 
-    match parse_extra_field(&mut result, &*extra_field) {
+    match parse_extra_field(&mut result, extra_field.as_ref()) {
         Ok(..) | Err(ZipError::Io(..)) => {},
         Err(e) => Err(e)?,
     }
@@ -423,6 +432,64 @@ fn get_reader<'a>(reader: &'a mut ZipFileReader) -> &'a mut Read {
     }
 }
 
+fn get_file_from_data<'a, R>(data: &'a mut ZipFileData, reader: &'a mut R) -> ZipResult<ZipFile<'a>> where R: Read + io::Seek {
+    if data.encrypted
+    {
+        return unsupported_zip_error("Encrypted files are not supported")
+    }
+
+    // Parse local header
+    reader.seek(io::SeekFrom::Start(data.header_start))?;
+    let signature = reader.read_u32::<LittleEndian>()?;
+    if signature != spec::LOCAL_FILE_HEADER_SIGNATURE
+    {
+        return Err(ZipError::InvalidArchive("Invalid local file header"))
+    }
+
+    reader.seek(io::SeekFrom::Current(22))?;
+    let file_name_length = reader.read_u16::<LittleEndian>()? as u64;
+    let extra_field_length = reader.read_u16::<LittleEndian>()? as u64;
+    let magic_and_header = 4 + 22 + 2 + 2;
+    data.data_start = data.header_start + magic_and_header + file_name_length + extra_field_length;
+
+    reader.seek(io::SeekFrom::Start(data.data_start))?;
+    let limit_reader = (reader as &mut Read).take(data.compressed_size);
+
+    Ok(ZipFile { reader: make_reader(data.compression_method, data.crc32, limit_reader)?, data: Cow::Borrowed(data) })
+}
+
+fn read_files_till<'a>(
+    number_of_files: usize,
+    files: &'a mut Vec<ZipFileData>,
+    names_map: &mut HashMap<StrBuf, usize>,
+    offset: u64,
+    central_directory_bytes: &mut ByteBuf,
+    predicate: ReadFilesTillPredicate,
+) -> ZipResult<&'a mut ZipFileData> {
+    for file_number in files.len()..number_of_files {
+        let file = central_header_to_zip_file(central_directory_bytes, offset)?;
+
+        let matches = match predicate {
+            ReadFilesTillPredicate::FileName(file_name) => &*file.file_name == file_name,
+            ReadFilesTillPredicate::Number(number) => number == file_number,
+        };
+
+        names_map.insert(file.file_name.clone(), file_number);
+        files.push(file);
+
+        if matches {
+            return Ok(&mut files[file_number]);
+        }
+    }
+
+    return Err(ZipError::FileNotFound);
+}
+
+enum ReadFilesTillPredicate<'a> {
+    FileName(&'a str),
+    Number(usize),
+}
+
 /// Methods for retrieving information on zip files
 impl<'a> ZipFile<'a> {
     fn get_reader(&mut self) -> &mut Read {
@@ -438,7 +505,7 @@ impl<'a> ZipFile<'a> {
     }
     /// Get the name of the file, in the raw (internal) byte representation.
     pub fn name_raw(&self) -> &[u8] {
-        &*self.data.file_name_raw
+        self.data.file_name_raw.as_ref()
     }
     /// Get the name of the file in a sanitized form. It truncates the name to the first NULL byte,
     /// removes a leading '/' and removes '..' parts.
@@ -595,9 +662,9 @@ pub fn read_zipfile_from_stream<'a, R: io::Read>(reader: &'a mut R) -> ZipResult
         crc32: crc32,
         compressed_size: compressed_size as u64,
         uncompressed_size: uncompressed_size as u64,
-        file_name: file_name,
-        file_name_raw: file_name_raw,
-        file_comment: String::new(), // file comment is only available in the central directory
+        file_name: file_name.into(),
+        file_name_raw: file_name_raw.into(),
+        file_comment: StrBuf::from_str(""), // file comment is only available in the central directory
         // header_start and data start are not available, but also don't matter, since seeking is
         // not available.
         header_start: 0,
diff --git a/src/types.rs b/src/types.rs
index 2abe3a36f..62de092f0 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -1,5 +1,7 @@
 //! Types that specify what is contained in a ZIP.
 
+use buffer::{ByteBuf, StrBuf};
+
 #[derive(Clone, Copy, Debug, PartialEq)]
 pub enum System
 {
@@ -193,7 +195,7 @@ pub const DEFAULT_VERSION: u8 = 46;
 
 /// Structure representing a ZIP file.
 #[derive(Debug, Clone)]
-pub struct ZipFileData
+pub(crate) struct ZipFileData
 {
     /// Compatibility of the file attribute information
     pub system: System,
@@ -212,11 +214,11 @@ pub struct ZipFileData
     /// Size of the file when extracted
     pub uncompressed_size: u64,
     /// Name of the file
-    pub file_name: String,
+    pub file_name: StrBuf,
     /// Raw file name. To be used when file_name was incorrectly decoded.
-    pub file_name_raw: Vec<u8>,
+    pub file_name_raw: ByteBuf,
     /// File comment
-    pub file_comment: String,
+    pub file_comment: StrBuf,
     /// Specifies where the local header of the file starts
     pub header_start: u64,
     /// Specifies where the compressed data of the file starts
@@ -278,7 +280,7 @@ mod test {
     #[test]
     fn sanitize() {
         use super::*;
-        let file_name = "/path/../../../../etc/./passwd\0/etc/shadow".to_string();
+        let file_name = "/path/../../../../etc/./passwd\0/etc/shadow";
         let data = ZipFileData {
             system: System::Dos,
             version_made_by: 0,
@@ -288,9 +290,9 @@ mod test {
             crc32: 0,
             compressed_size: 0,
             uncompressed_size: 0,
-            file_name: file_name.clone(),
-            file_name_raw: file_name.into_bytes(),
-            file_comment: String::new(),
+            file_name: StrBuf::from_str(file_name),
+            file_name_raw: ByteBuf::from(file_name.as_bytes().to_vec()),
+            file_comment: StrBuf::from_str(""),
             header_start: 0,
             data_start: 0,
             external_attributes: 0,
diff --git a/src/write.rs b/src/write.rs
index 4dec56f87..e9483d07e 100644
--- a/src/write.rs
+++ b/src/write.rs
@@ -1,5 +1,6 @@
 //! Structs for creating a new zip archive
 
+use buffer::StrBuf;
 use compression::CompressionMethod;
 use types::{ZipFileData, System, DEFAULT_VERSION, DateTime};
 use spec;
@@ -198,9 +199,9 @@ impl<W: Write+io::Seek> ZipWriter<W>
             crc32: 0,
             compressed_size: 0,
             uncompressed_size: 0,
-            file_name: file_name,
-            file_name_raw: file_name_raw,
-            file_comment: String::new(),
+            file_name: file_name.into(),
+            file_name_raw: file_name_raw.into(),
+            file_comment: StrBuf::from_str(""),
             header_start: header_start,
             data_start: 0,
             external_attributes: permissions << 16,
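
For context, a minimal consumer-side sketch of how the lazily parsed central directory behaves after this change. The archive path and entry name below are made up for illustration; the public API stays the same — `ZipArchive::new` now only buffers the raw central directory bytes, and entry headers are parsed on the first `by_name`/`by_index` access.

```rust
// Illustrative usage only; "archive.zip" and "hello.txt" are hypothetical.
extern crate zip;

use std::fs::File;
use std::io::Read;

fn main() -> zip::result::ZipResult<()> {
    // `new` locates the end-of-central-directory record and reads the raw
    // central directory into memory, but parses no entry headers yet.
    let mut archive = zip::ZipArchive::new(File::open("archive.zip")?)?;

    // The first lookup parses headers lazily (caching them in `files` /
    // `names_map`) until the requested entry is found.
    let mut entry = archive.by_name("hello.txt")?;
    let mut contents = String::new();
    entry.read_to_string(&mut contents)?;
    println!("{}: {} bytes", entry.name(), contents.len());
    Ok(())
}
```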