Skip to content
This repository has been archived by the owner on Jun 2, 2024. It is now read-only.

Zero copy central directory parsing #91

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ podio = "0.1"
bzip2 = { version = "0.3", optional = true }
libflate = { version = "0.1.16", optional = true }
crc32fast = "1.0"
bytes = "0.4.11"
string = "0.1.2"

[dev-dependencies]
bencher = "0.1"
Expand Down
128 changes: 128 additions & 0 deletions src/buffer.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
use std::borrow::{Borrow, Cow};
use std::io::Read;
use std::ops::Deref;
use std::hash::{Hash, Hasher};

use bytes::{Buf, Bytes};
use string;

/// Crate-internal string type whose storage is a `Bytes` buffer
/// (it wraps `string::String<Bytes>`).
///
/// `Hash` is deliberately not in the derive list: it is implemented by hand
/// so that it agrees with `str`, which the `Borrow<str>` impl relies on.
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) struct StrBuf(string::String<Bytes>);

impl StrBuf {
pub(crate) fn from_str(s: &str) -> Self {
StrBuf(string::String::from_str(s))
}

pub(crate) fn from_utf8(bytes: ByteBuf) -> Result<Self, ::std::str::Utf8Error> {
Ok(StrBuf(string::TryFrom::try_from(bytes.0)?))
}

pub(crate) fn from_utf8_lossy(bytes: ByteBuf) -> Self {
match String::from_utf8_lossy(bytes.as_ref()) {
Cow::Owned(s) => s.into(),
Cow::Borrowed(s) => {
// SAFETY: We know that `bytes` only contains valid utf-8,
// since the `from_utf8_lossy` operation returned the
// input verbatim.
debug_assert_eq!(s.len(), bytes.len());
StrBuf(unsafe { string::String::from_utf8_unchecked(bytes.clone().0) })
}
}
}
}

impl From<String> for StrBuf {
fn from(s: String) -> Self {
let bytes = s.into_bytes().into();
// SAFETY: We know that `bytes` only contains valid utf-8,
// since the underlying data comes from the input string.
StrBuf(unsafe { string::String::from_utf8_unchecked(bytes) })
}
}

impl Hash for StrBuf {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not #[derive] it? Is there a benefit to matching the str's hash rather than the Bytes's hash (which String<Bytes>'s Hash forwards to)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because of the impl Borrow<str> for StrBuf, I need to make sure that the hash implementations behave identically between str and StrBuf.

Quoting the documentation for the Borrow trait:

Further, when providing implementations for additional traits, it needs to be considered whether they should behave identical to those of the underlying type as a consequence of acting as a representation of that underlying type. Generic code typically uses Borrow<T> when it relies on the identical behavior of these additional trait implementations. These traits will likely appear as additional trait bounds.

Without this, it would be impossible to look up an entry from the names_map by &str, since the str and StrBuf would evaluate to different hashes, even if they represent the same sequence of chars.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I didn't notice the Borrow<str> impl.

fn hash<H: Hasher>(&self, state: &mut H) {
    // Hash exactly the way `str` does.
    //
    // `StrBuf` implements `Borrow<str>`, and the `Borrow` documentation asks
    // that additional traits such as `Hash` behave identically for a type and
    // the type it borrows as, because generic code (hash maps in particular)
    // relies on that.
    //
    // If the two disagreed, looking up an entry in the names_map by `&str`
    // would be impossible: a `str` and a `StrBuf` holding the same sequence
    // of characters would evaluate to different hashes.
    let text: &str = self;
    <str as Hash>::hash(text, state)
}
}

impl Borrow<str> for StrBuf {
    /// Borrows the contents as a `&str`, so a `StrBuf` key can be looked up
    /// with a plain string slice.
    fn borrow(&self) -> &str {
        Borrow::<str>::borrow(&self.0)
    }
}

impl Deref for StrBuf {
    type Target = str;

    /// Gives access to the contents as a `str`, which makes all `&str`
    /// methods available on a `StrBuf`.
    fn deref(&self) -> &str {
        Deref::deref(&self.0)
    }
}

/// Crate-internal byte buffer: a newtype around `bytes::Bytes`.
#[derive(Clone, Debug)]
pub(crate) struct ByteBuf(Bytes);

impl ByteBuf {
pub(crate) fn len(&self) -> usize {
self.0.len()
}

pub(crate) fn split_to(&mut self, at: usize) -> ByteBuf {
ByteBuf(self.0.split_to(at))
}
}

impl Buf for ByteBuf {
fn remaining(&self) -> usize {
self.0.len()
}

fn bytes(&self) -> &[u8] {
self.0.as_ref()
}

fn advance(&mut self, cnt: usize) {
self.0.advance(cnt)
}
}

impl AsRef<[u8]> for ByteBuf {
fn as_ref(&self) -> &[u8] {
self.0.as_ref()
}
}

impl Read for ByteBuf {
fn read(&mut self, buf: &mut [u8]) -> ::std::io::Result<usize> {
self.reader().read(buf)
}
}

impl From<Vec<u8>> for ByteBuf {
fn from(vec: Vec<u8>) -> Self {
ByteBuf(vec.into())
}
}

impl From<Bytes> for ByteBuf {
/// Wraps an existing `Bytes` value in a `ByteBuf` as-is.
fn from(bytes: Bytes) -> Self {
ByteBuf(bytes)
}
}
15 changes: 15 additions & 0 deletions src/cp437.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
//! Convert a string in IBM codepage 437 to UTF-8

use buffer::{StrBuf, ByteBuf};

/// Trait to convert IBM codepage 437 to the target type
pub trait FromCp437 {
/// Target type
Expand Down Expand Up @@ -37,6 +39,19 @@ impl FromCp437 for Vec<u8> {
}
}

impl FromCp437 for ByteBuf {
    type Target = StrBuf;

    /// Converts a buffer of IBM codepage 437 bytes into a `StrBuf`.
    ///
    /// If every byte is below `0x80` the buffer is already valid UTF-8, so it
    /// is handed to `StrBuf::from_utf8` and reused as the string's storage.
    /// Otherwise each byte is mapped through `to_char` and the characters are
    /// collected into a new `String`.
    fn from_cp437(self) -> Self::Target {
        if self.as_ref().is_ascii() {
            // Cannot fail: a byte sequence made only of ASCII is valid UTF-8.
            StrBuf::from_utf8(self).expect("ASCII-only input is always valid UTF-8")
        } else {
            self.as_ref()
                .iter()
                .map(|&byte| to_char(byte))
                .collect::<String>()
                .into()
        }
    }
}

fn to_char(input: u8) -> char
{
let output = match input
Expand Down
3 changes: 3 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

#![warn(missing_docs)]

extern crate bytes;
#[cfg(feature = "bzip2")]
extern crate bzip2;
extern crate crc32fast;
#[cfg(feature = "deflate")]
extern crate libflate;
extern crate podio;
extern crate string;
#[cfg(feature = "time")]
extern crate time;

Expand All @@ -16,6 +18,7 @@ pub use write::ZipWriter;
pub use compression::CompressionMethod;
pub use types::DateTime;

mod buffer;
mod spec;
mod crc32;
mod types;
Expand Down
Loading