Skip to content
This repository has been archived by the owner on Jun 2, 2024. It is now read-only.

Read files inside ZipArchive lazily #89

Closed
wants to merge 4 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 114 additions & 45 deletions src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use std::io;
use std::io::prelude::*;
use std::collections::HashMap;
use std::borrow::Cow;
use std::mem;

use podio::{ReadPodExt, LittleEndian};
use types::{ZipFileData, System, DateTime};
Expand Down Expand Up @@ -54,10 +55,12 @@ mod ffi {
pub struct ZipArchive<R: Read + io::Seek>
{
// Underlying seekable reader; both central headers and file data are read through it.
reader: R,
// Total number of entries, as reported by `get_directory_counts` when the archive is
// opened. This is the value `len()` reports, independent of how many entries are parsed.
number_of_files: usize,
// Entries whose central headers have been parsed so far, in central-directory order.
// Grows on demand up to `number_of_files` elements.
files: Vec<ZipFileData>,
// Maps a file name to its index in `files`; only covers entries parsed so far.
names_map: HashMap<String, usize>,
// Archive offset passed to `central_header_to_zip_file` when parsing each entry.
offset: u64,
// Archive comment taken from the footer (`zip_file_comment`).
comment: Vec<u8>,
// Reader position at which the next unparsed central header starts (initially the
// start of the central directory), or `Poisoned` after an interrupted read.
last_central_header_end: LastCentralHeaderEnd,
}

enum ZipFileReader<'a> {
Expand All @@ -69,6 +72,12 @@ enum ZipFileReader<'a> {
Bzip2(Crc32Reader<BzDecoder<io::Take<&'a mut Read>>>),
}

/// Resume cursor for reading the central directory one header at a time.
#[derive(Clone, Copy, Debug)]
enum LastCentralHeaderEnd {
/// Reader position where the next unparsed central header begins: the start of the
/// central directory at first, then the position right after the last header parsed.
Pos(u64),
/// Placeholder swapped in while `read_files_till` is running. It stays in place if that
/// call returns early with an error, and every later call then fails with `InvalidArchive`.
Poisoned,
}

/// A struct for reading a zip file
pub struct ZipFile<'a> {
data: Cow<'a, ZipFileData>,
Expand Down Expand Up @@ -203,26 +212,14 @@ impl<R: Read+io::Seek> ZipArchive<R>
let (archive_offset, directory_start, number_of_files) =
Self::get_directory_counts(&mut reader, &footer, cde_start_pos)?;

let mut files = Vec::new();
let mut names_map = HashMap::new();

if let Err(_) = reader.seek(io::SeekFrom::Start(directory_start)) {
return Err(ZipError::InvalidArchive("Could not seek to start of central directory"));
}

for _ in 0 .. number_of_files
{
let file = central_header_to_zip_file(&mut reader, archive_offset)?;
names_map.insert(file.file_name.clone(), files.len());
files.push(file);
}

Ok(ZipArchive {
reader: reader,
files: files,
names_map: names_map,
number_of_files: number_of_files,
files: Vec::with_capacity(number_of_files),
names_map: HashMap::new(),
offset: archive_offset,
comment: footer.zip_file_comment,
last_central_header_end: LastCentralHeaderEnd::Pos(directory_start),
})
}

Expand All @@ -240,7 +237,7 @@ impl<R: Read+io::Seek> ZipArchive<R>
/// ```
pub fn len(&self) -> usize
{
self.files.len()
self.number_of_files
}

/// Get the offset from the beginning of the underlying reader that this zip begins at, in bytes.
Expand All @@ -254,42 +251,44 @@ impl<R: Read+io::Seek> ZipArchive<R>
/// Search for a file entry by name
pub fn by_name<'a>(&'a mut self, name: &str) -> ZipResult<ZipFile<'a>>
{
let index = match self.names_map.get(name) {
Some(index) => *index,
None => { return Err(ZipError::FileNotFound); },
};
self.by_index(index)
if let Some(&index) = self.names_map.get(name) {
return self.by_index(index);
}

let data = read_files_till(
&mut self.reader,
self.number_of_files,
&mut self.files,
&mut self.names_map,
self.offset,
&mut self.last_central_header_end,
ReadFilesTillPredicate::FileName(name),
)?;

get_file_from_data(data, &mut self.reader)
}

/// Get a contained file by index
pub fn by_index<'a>(&'a mut self, file_number: usize) -> ZipResult<ZipFile<'a>>
{
if file_number >= self.files.len() { return Err(ZipError::FileNotFound); }
let ref mut data = self.files[file_number];
if file_number >= self.number_of_files { return Err(ZipError::FileNotFound); }

if data.encrypted
{
return unsupported_zip_error("Encrypted files are not supported")
}

// Parse local header
self.reader.seek(io::SeekFrom::Start(data.header_start))?;
let signature = self.reader.read_u32::<LittleEndian>()?;
if signature != spec::LOCAL_FILE_HEADER_SIGNATURE
{
return Err(ZipError::InvalidArchive("Invalid local file header"))
let data = if file_number < self.files.len() {
&mut self.files[file_number]
}
else {
read_files_till(
&mut self.reader,
self.number_of_files,
&mut self.files,
&mut self.names_map,
self.offset,
&mut self.last_central_header_end,
ReadFilesTillPredicate::Number(file_number),
)?
};

self.reader.seek(io::SeekFrom::Current(22))?;
let file_name_length = self.reader.read_u16::<LittleEndian>()? as u64;
let extra_field_length = self.reader.read_u16::<LittleEndian>()? as u64;
let magic_and_header = 4 + 22 + 2 + 2;
data.data_start = data.header_start + magic_and_header + file_name_length + extra_field_length;

self.reader.seek(io::SeekFrom::Start(data.data_start))?;
let limit_reader = (self.reader.by_ref() as &mut Read).take(data.compressed_size);

Ok(ZipFile { reader: make_reader(data.compression_method, data.crc32, limit_reader)?, data: Cow::Borrowed(data) })
get_file_from_data(data, &mut self.reader)
}

/// Unwrap and return the inner reader object
Expand Down Expand Up @@ -423,6 +422,76 @@ fn get_reader<'a>(reader: &'a mut ZipFileReader) -> &'a mut Read {
}
}

/// Open the entry described by `data` for reading through `reader`.
///
/// Validates the local file header found at `data.header_start`, computes where the
/// entry's payload begins, stores that in `data.data_start`, and returns a `ZipFile`
/// whose reader is limited to `data.compressed_size` bytes starting at that position.
///
/// # Errors
///
/// * the error produced by `unsupported_zip_error` if the entry is flagged as encrypted;
/// * `ZipError::InvalidArchive` if the local file header signature does not match;
/// * any I/O error raised while seeking or reading, or any error from `make_reader`.
fn get_file_from_data<'a, R>(data: &'a mut ZipFileData, reader: &'a mut R) -> ZipResult<ZipFile<'a>>
    where R: Read + io::Seek
{
    // Signature (4) + fixed fields skipped below (22) + name length (2) + extra length (2).
    const FIXED_LOCAL_HEADER_LEN: u64 = 4 + 22 + 2 + 2;

    if data.encrypted {
        return unsupported_zip_error("Encrypted files are not supported");
    }

    // The local header must start with the expected magic number.
    reader.seek(io::SeekFrom::Start(data.header_start))?;
    if reader.read_u32::<LittleEndian>()? != spec::LOCAL_FILE_HEADER_SIGNATURE {
        return Err(ZipError::InvalidArchive("Invalid local file header"));
    }

    // Skip the 22 bytes of fixed-size fields between the signature and the two
    // variable-length counts; only those counts are needed to locate the payload.
    reader.seek(io::SeekFrom::Current(22))?;
    let name_len = reader.read_u16::<LittleEndian>()? as u64;
    let extra_len = reader.read_u16::<LittleEndian>()? as u64;
    data.data_start = data.header_start + FIXED_LOCAL_HEADER_LEN + name_len + extra_len;

    // Position on the payload and cap reads at the compressed size so the returned
    // reader cannot run into whatever follows this entry.
    reader.seek(io::SeekFrom::Start(data.data_start))?;
    let limit_reader = (reader as &mut Read).take(data.compressed_size);
    let file_reader = make_reader(data.compression_method, data.crc32, limit_reader)?;

    Ok(ZipFile { reader: file_reader, data: Cow::Borrowed(data) })
}

/// Parse further central directory headers until `predicate` is satisfied.
///
/// Reading resumes at the position stored in `last_central_header_end`. Every header
/// parsed on the way is appended to `files` and registered in `names_map` under its
/// file name, so later lookups can be served without touching the reader again.
/// `names_map.insert` replaces any index already stored under the same name, so when
/// an archive contains duplicate names the most recently parsed entry wins in the map.
///
/// On success a mutable reference to the matching entry in `files` is returned.
///
/// # Errors
///
/// * `ZipError::InvalidArchive` if the cursor is `Poisoned` or the initial seek fails;
/// * any error from `central_header_to_zip_file`;
/// * any I/O error from querying the reader position after parsing;
/// * `ZipError::FileNotFound` if all remaining headers were parsed without a match.
///
/// The cursor is set to `Poisoned` on entry and only restored to a `Pos` once parsing
/// has stopped cleanly. Returning early with any error other than `FileNotFound`
/// therefore leaves it poisoned.
fn read_files_till<'a, R>(
    mut reader: R,
    number_of_files: usize,
    files: &'a mut Vec<ZipFileData>,
    names_map: &mut HashMap<String, usize>,
    offset: u64,
    last_central_header_end: &mut LastCentralHeaderEnd,
    predicate: ReadFilesTillPredicate,
) -> ZipResult<&'a mut ZipFileData>
    where R: Read + io::Seek
{
    // Take the cursor out, leaving the poison marker behind until we finish cleanly.
    let resume_at = match mem::replace(last_central_header_end, LastCentralHeaderEnd::Poisoned) {
        LastCentralHeaderEnd::Poisoned => {
            return Err(ZipError::InvalidArchive("Central file header is corrupt"));
        }
        LastCentralHeaderEnd::Pos(pos) => pos,
    };

    if reader.seek(io::SeekFrom::Start(resume_at)).is_err() {
        return Err(ZipError::InvalidArchive("Could not seek to start of next central file header"));
    }

    // Index of the matching entry, once one has been parsed.
    let mut found = None;

    while files.len() < number_of_files {
        // The entry about to be parsed will land at the current end of `files`.
        let index = files.len();
        let entry = central_header_to_zip_file(&mut reader, offset)?;

        let is_match = match predicate {
            ReadFilesTillPredicate::FileName(wanted) => entry.file_name == wanted,
            ReadFilesTillPredicate::Number(wanted) => wanted == index,
        };

        names_map.insert(entry.file_name.clone(), index);
        files.push(entry);

        if is_match {
            found = Some(index);
            break;
        }
    }

    // Parsing stopped on a header boundary: record it so the next call resumes here.
    let stopped_at = reader.seek(io::SeekFrom::Current(0))?;
    *last_central_header_end = LastCentralHeaderEnd::Pos(stopped_at);

    match found {
        Some(index) => Ok(&mut files[index]),
        None => Err(ZipError::FileNotFound),
    }
}

/// Stop condition handed to `read_files_till`.
enum ReadFilesTillPredicate<'a> {
/// Stop at the first newly parsed entry whose `file_name` equals this string.
FileName(&'a str),
/// Stop once the entry with this index (its position in the parsed `files` list) has been parsed.
Number(usize),
}

/// Methods for retrieving information on zip files
impl<'a> ZipFile<'a> {
fn get_reader(&mut self) -> &mut Read {
Expand Down