From 9b4459213c7ce5432d53be92cd2cf56d1221740e Mon Sep 17 00:00:00 2001 From: Alex Crichton Date: Sat, 17 Nov 2018 22:28:30 -0800 Subject: [PATCH] Implement a SIMD fast path for CRC checksums Recently I was profiling Cargo's extraction of tarballs and was quite surprised to learn that 15% of the execution time of tarball extraction was entirely crc32 checksum calculations in miniz. This was quite a surprise to me and led me down a long rabbit hole of figuring out how to speed this up! It turns out Intel's written a paper, "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction", which describes how to implement a CRC-32 value using hardware instructions. Note that these are not the hardware CRC instructions, which I think are a different algorithm. This commit implements this paper in Rust, looking to a few other external implementations for guidance as well. Overall the results are quite promising, and I'm pretty confident in the correctness of this as well. Current results look like: * This SIMD implementation runs at about 25GB/s * The miniz implementation runs at about 450MB/s * The zlib implementation, on OSX, runs at 25GB/s (seems to implement the same algorithm) * The bundled zlib implementation (and also the one I found on Linux) runs at 1.4GB/s So this should be ~50 times faster for Cargo (which uses miniz), about 20 times faster for anyone using system zlib on Linux or the bundled zlib, and on par with OSX's zlib performance. 
--- .travis.yml | 9 +++ Cargo.toml | 1 + appveyor.yml | 10 ++- flate2-crc/Cargo.toml | 21 +++++ flate2-crc/benches/run.rs | 69 ++++++++++++++++ flate2-crc/build.rs | 36 +++++++++ flate2-crc/src/lib.rs | 103 ++++++++++++++++++++++++ flate2-crc/src/other.rs | 12 +++ flate2-crc/src/x86.rs | 160 ++++++++++++++++++++++++++++++++++++++ src/crc.rs | 27 +++++-- src/lib.rs | 1 + 11 files changed, 439 insertions(+), 10 deletions(-) create mode 100644 flate2-crc/Cargo.toml create mode 100644 flate2-crc/benches/run.rs create mode 100644 flate2-crc/build.rs create mode 100644 flate2-crc/src/lib.rs create mode 100644 flate2-crc/src/other.rs create mode 100644 flate2-crc/src/x86.rs diff --git a/.travis.yml b/.travis.yml index 14da3357b..2ed33e8d6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,10 +10,12 @@ matrix: - rust: stable script: cargo run --manifest-path systest/Cargo.toml + name: "systest" - rust: nightly install: rustup target add wasm32-unknown-unknown script: cargo build --target wasm32-unknown-unknown + name: "wasm" - rust: stable env: RUST_BACKEND=1 @@ -28,6 +30,7 @@ matrix: - cargo doc --no-deps --all-features after_success: - travis-cargo --only nightly doc-upload + name: "docs" allow_failures: - env: RUST_BACKEND=1 @@ -40,10 +43,16 @@ script: - cargo test --features tokio - cargo test --features 'tokio zlib' - cargo test --features zlib --no-default-features + - cargo test --manifest-path flate2-crc/Cargo.toml + - cargo test --release --manifest-path flate2-crc/Cargo.toml - cargo clean && cargo build - cargo doc --no-deps - cargo doc --no-deps --manifest-path=miniz-sys/Cargo.toml +branches: + only: + - master + env: global: secure: "PHVT7IaeP5nQQVwGHKwqCYBDp0QyetSlER7se2j2Xgfx+lw3Bu6VWH6VF04B636Gb0tHPN/sUCXSgGRcvDuy6XFOev4LfynoYxNKgHJYg2E34EP2QLwsFfnvE4iujaG3GJk3o935Y7OYGv2OP1HeG4Mv6JhQK0GLnNDBZQ65kWI=" diff --git a/Cargo.toml b/Cargo.toml index eb97da4ef..34bf99954 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ libz-sys = { version = "1.0", 
optional = true } tokio-io = { version = "0.1", optional = true } futures = { version = "0.1", optional = true } miniz_oxide_c_api = { version = "0.2", optional = true, features = ["no_c_export"]} +flate2-crc = { version = '0.1', path = 'flate2-crc' } [target.'cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))'.dependencies] miniz_oxide_c_api = { version = "0.2", features = ["no_c_export"] } diff --git a/appveyor.yml b/appveyor.yml index 0a140f7bc..f66ffed29 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -20,5 +20,11 @@ install: build: false test_script: - - cargo test --verbose --target %TARGET% - - cargo test --verbose --target %TARGET% --features tokio + - cargo test --target %TARGET% + - cargo test --target %TARGET% --features tokio + - cargo test --target %TARGET% --manifest-path flate2-crc/Cargo.toml + - cargo test --target %TARGET% --manifest-path flate2-crc/Cargo.toml --release + +branches: + only: + - master diff --git a/flate2-crc/Cargo.toml b/flate2-crc/Cargo.toml new file mode 100644 index 000000000..0c0ee797d --- /dev/null +++ b/flate2-crc/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "flate2-crc" +version = "0.1.0" +authors = ["Alex Crichton "] +license = "MIT/Apache-2.0" +repository = "https://github.com/alexcrichton/flate2-rs/tree/flate2-crc" +homepage = "https://github.com/alexcrichton/flate2-rs" +documentation = "https://docs.rs/flate2-crc" +description = """ +SIMD acceleration for CRC-32 checksums used in the gzip format +""" + +[dependencies] +cfg-if = "0.1.6" + +[dev-dependencies] +miniz-sys = { path = '../miniz-sys' } +rand = "0.6" +libz-sys = "1.0" +rayon = "1.0.3" +quickcheck = "0.7" diff --git a/flate2-crc/benches/run.rs b/flate2-crc/benches/run.rs new file mode 100644 index 000000000..ae49130e5 --- /dev/null +++ b/flate2-crc/benches/run.rs @@ -0,0 +1,69 @@ +#![feature(test)] + +extern crate flate2_crc; +extern crate rand; +extern crate test; +extern crate miniz_sys; +extern crate libz_sys; + +use rand::{thread_rng, RngCore}; 
+ +fn flate2_crc(data: &[u8]) -> u32 { + flate2_crc::Hardware::detect().calculate(0, data, |crc, data| { + unsafe { + miniz_sys::mz_crc32(crc as u64, data.as_ptr(), data.len()) as u32 + } + }) +} + +fn miniz(data: &[u8]) -> u32 { + unsafe { + miniz_sys::mz_crc32(0, data.as_ptr(), data.len()) as u32 + } +} + +fn zlib(data: &[u8]) -> u32 { + unsafe { + libz_sys::crc32(0, data.as_ptr(), data.len() as u32) as u32 + } +} + +macro_rules! benches { + ($($f:ident => ($small:ident, $medium:ident, $large:ident),)*) => ($( + #[bench] + fn $small(b: &mut test::Bencher) { + let mut rng = thread_rng(); + let mut buf = vec![0u8; 8]; + rng.fill_bytes(&mut buf); + + b.bytes = 8; + b.iter(|| $f(&buf)); + } + + #[bench] + fn $medium(b: &mut test::Bencher) { + let mut rng = thread_rng(); + let mut buf = vec![0u8; 65_000]; + rng.fill_bytes(&mut buf); + + b.bytes = 65_000; + b.iter(|| $f(&buf)); + } + + #[bench] + fn $large(b: &mut test::Bencher) { + let mut rng = thread_rng(); + let mut buf = vec![0u8; 1_000_000]; + rng.fill_bytes(&mut buf); + + b.bytes = 1_000_000; + b.iter(|| $f(&buf)); + } + )*) +} + +benches! { + flate2_crc => (flate2_crc_8, flate2_crc_65000, flate2_crc_1000000), + miniz => (miniz_8, miniz_65000, miniz_1000000), + zlib => (zlib_8, zlib_65000, zlib_1000000), +} diff --git a/flate2-crc/build.rs b/flate2-crc/build.rs new file mode 100644 index 000000000..d03562018 --- /dev/null +++ b/flate2-crc/build.rs @@ -0,0 +1,36 @@ +use std::env; +use std::process::Command; +use std::str; + +fn main() { + println!("cargo:rerun-if-changed=build.rs"); + + let minor = match rustc_minor_version() { + Some(n) => n, + None => return, + }; + + if minor >= 27 { + println!("cargo:rustc-cfg=simd"); + } +} + +fn rustc_minor_version() -> Option { + macro_rules! 
otry { + ($e:expr) => { + match $e { + Some(e) => e, + None => return None, + } + }; + } + let rustc = otry!(env::var_os("RUSTC")); + let output = otry!(Command::new(rustc).arg("--version").output().ok()); + let version = otry!(str::from_utf8(&output.stdout).ok()); + let mut pieces = version.split('.'); + if pieces.next() != Some("rustc 1") { + return None; + } + otry!(pieces.next()).parse().ok() +} + diff --git a/flate2-crc/src/lib.rs b/flate2-crc/src/lib.rs new file mode 100644 index 000000000..f221519c3 --- /dev/null +++ b/flate2-crc/src/lib.rs @@ -0,0 +1,103 @@ +// Note that this isn't really intended to be a user-facing crate, that's +// `flate2::Crc` + +#[macro_use] +extern crate cfg_if; + +#[cfg(test)] +#[macro_use] +extern crate quickcheck; + +cfg_if! { + if #[cfg(not(simd))] { + mod other; + use self::other as imp; + } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + mod x86; + use self::x86 as imp; + } else { + mod other; + use self::other as imp; + } +} + +#[derive(Debug)] +pub struct Hardware(bool); + +impl Hardware { + #[inline] + pub fn detect() -> Hardware { + Hardware(imp::detect()) + } + + #[inline] + pub fn calculate( + &self, + crc: u32, + data: &[u8], + fallback: fn(u32, &[u8]) -> u32, + ) -> u32 { + if self.0 { + unsafe { imp::calculate(crc, data, fallback) } + } else { + fallback(crc, data) + } + } +} + +#[cfg(test)] +mod tests { + extern crate miniz_sys; + extern crate rand; + extern crate rayon; + + use self::rand::Rng; + use self::rayon::prelude::*; + use super::Hardware; + + fn fallback(a: u32, b: &[u8]) -> u32 { + unsafe { + miniz_sys::mz_crc32(a as _, b.as_ptr(), b.len()) as u32 + } + } + + fn random_chunks(iters: usize, lo: usize, hi: usize) { + let hardware = Hardware::detect(); + + (0..iters) + .into_par_iter() + .for_each_with(Vec::new(), |data, _| { + let mut rng = rand::thread_rng(); + let init = rng.gen::(); + let len = rng.gen_range(lo, hi); + data.resize(len, 0u8); + rng.fill(&mut data[..]); + + assert_eq!( + 
+ fallback(init, &data), + hardware.calculate(init, &data, fallback), + ); + }); + } + + #[test] + fn random_small() { + random_chunks(1000, 0, 256); + } + + #[test] + fn random_med() { + random_chunks(1000, 256, 16 * 1024); + } + + #[test] + fn random_large() { + random_chunks(1000, 0, 1024 * 1024); + } + + quickcheck! { + fn prop(crc: u32, xs: Vec) -> bool { + fallback(crc, &xs) == Hardware::detect().calculate(crc, &xs, fallback) + } + } +} diff --git a/flate2-crc/src/other.rs b/flate2-crc/src/other.rs new file mode 100644 index 000000000..5e855aa20 --- /dev/null +++ b/flate2-crc/src/other.rs @@ -0,0 +1,12 @@ +#[inline] +pub fn detect() -> bool { + false +} + +pub unsafe fn calculate( + _crc: u32, + _data: &[u8], + _fallback: fn(u32, &[u8]) -> u32, +) -> u32 { + panic!() +} diff --git a/flate2-crc/src/x86.rs b/flate2-crc/src/x86.rs new file mode 100644 index 000000000..bf6595720 --- /dev/null +++ b/flate2-crc/src/x86.rs @@ -0,0 +1,160 @@ +//! SIMD-based implementation of crc-32 checksums for x86 hardware. +//! +//! This module is based on Intel's paper, "Fast CRC Computation for Generic +//! Polynomials Using PCLMULQDQ Instruction". The code is quite analogous to the +//! paper itself and only largely differs in one area. More information in the +//! comments below! 
+ +#![allow(non_upper_case_globals)] + +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; +#[cfg(target_arch = "x86")] +use std::arch::x86::*; + +const K1: i64 = 0x154442bd4; +const K2: i64 = 0x1c6e41596; +const K3: i64 = 0x1751997d0; +const K4: i64 = 0x0ccaa009e; +const K5: i64 = 0x163cd6124; +const K6: i64 = 0x1db710640; + +const P_x: i64 = 0x1DB710641; +const U_prime: i64 = 0x1F7011641; + +pub fn detect() -> bool { + is_x86_feature_detected!("pclmulqdq") && + is_x86_feature_detected!("sse2") && + is_x86_feature_detected!("sse4.1") +} + +unsafe fn debug(s: &str, a: __m128i) -> __m128i { + if false { + union A { a: __m128i, b: [u8; 16] } + let x = A { a }.b; + print!(" {:20} | ", s); + for x in x.iter() { + print!("{:02x} ", x); + } + println!(); + } + return a +} + +#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")] +pub unsafe fn calculate( + crc: u32, + mut data: &[u8], + fallback: fn(u32, &[u8]) -> u32, +) -> u32 { + // In theory we can accelerate smaller chunks too, but for now just rely on + // the fallback implementation as it's too much hassle and doesn't seem too + // beneficial. 
+ if data.len() < 128 { + return fallback(crc, data) + } + + // Step 1: fold by 4 loop + let mut x3 = get(&mut data); + let mut x2 = get(&mut data); + let mut x1 = get(&mut data); + let mut x0 = get(&mut data); + + // fold in our initial value, part of the incremental crc checksum + x3 = _mm_xor_si128(x3, _mm_cvtsi32_si128(!crc as i32)); + + let k1k2 = _mm_set_epi64x(K2, K1); + while data.len() >= 64 { + x3 = reduce128(x3, get(&mut data), k1k2); + x2 = reduce128(x2, get(&mut data), k1k2); + x1 = reduce128(x1, get(&mut data), k1k2); + x0 = reduce128(x0, get(&mut data), k1k2); + } + + let k3k4 = _mm_set_epi64x(K4, K3); + let mut x = reduce128(x3, x2, k3k4); + x = reduce128(x, x1, k3k4); + x = reduce128(x, x0, k3k4); + + // Step 2: fold by 1 loop + while data.len() >= 16 { + x = reduce128(x, get(&mut data), k3k4); + } + + debug("128 > 64 init", x); + + // Perform step 3, reduction from 128 bits to 64 bits. This is + // significantly different from the paper and basically doesn't follow it + // at all. It's not really clear why, but implementations of this algorithm + // in Chrome/Linux diverge in the same way. It is beyond me why this is + // different than the paper, maybe the paper has like errata or something? + // Unclear. + // + // It's also not clear to me what's actually happening here and/or why, but + // algebraically what's happening is: + // + // x = (x[0:63] • K4) ^ x[64:127] // 96 bit result + // x = ((x[0:31] as u64) • K5) ^ x[32:95] // 64 bit result + // + // It's... not clear to me what's going on here. The paper itself is pretty + // vague on this part but definitely uses different constants at least. + // It's not clear to me, reading the paper, where the xor operations are + // happening or why things are shifting around. This implementation... + // appears to work though! 
+ drop(K6); + let x = _mm_xor_si128( + _mm_clmulepi64_si128(x, k3k4, 0x10), + _mm_srli_si128(x, 8), + ); + let x = _mm_xor_si128( + _mm_clmulepi64_si128( + _mm_and_si128(x, _mm_set_epi32(0, 0, 0, !0)), + _mm_set_epi64x(0, K5), + 0x00, + ), + _mm_srli_si128(x, 4), + ); + debug("128 > 64 xx", x); + + // Perform a Barrett reduction from our now 64 bits to 32 bits. The + // algorithm for this is described at the end of the paper, and note that + // this also implements the "bit reflected input" variant. + let pu = _mm_set_epi64x(U_prime, P_x); + + // T1(x) = ⌊(R(x) % x^32)⌋ • μ + let t1 = _mm_clmulepi64_si128( + _mm_and_si128(x, _mm_set_epi32(0, 0, 0, !0)), + pu, + 0x10, + ); + // T2(x) = ⌊(T1(x) % x^32)⌋ • P(x) + let t2 = _mm_clmulepi64_si128( + _mm_and_si128(t1, _mm_set_epi32(0, 0, 0, !0)), + pu, + 0x00, + ); + // We're doing the bit-reflected variant, so get the upper 32-bits of the + // 64-bit result instead of the lower 32-bits. + // + // C(x) = R(x) ^ T2(x) / x^32 + let c = _mm_extract_epi32(_mm_xor_si128(x, t2), 1) as u32; + + if data.len() > 0 { + fallback(!c, data) + } else { + !c + } +} + +unsafe fn reduce128(a: __m128i, b: __m128i, keys: __m128i) -> __m128i { + let t1 = _mm_clmulepi64_si128(a, keys, 0x00); + let t2 = _mm_clmulepi64_si128(a, keys, 0x11); + _mm_xor_si128(_mm_xor_si128(b, t1), t2) +} + +unsafe fn get(a: &mut &[u8]) -> __m128i { + debug_assert!(a.len() >= 16); + let r = _mm_loadu_si128(a.as_ptr() as *const __m128i); + *a = &a[16..]; + return r +} diff --git a/src/crc.rs b/src/crc.rs index 0d621a921..186d050fe 100644 --- a/src/crc.rs +++ b/src/crc.rs @@ -2,6 +2,8 @@ use std::io::prelude::*; use std::io; + +use flate2_crc::Hardware; use libc; use ffi; @@ -11,8 +13,9 @@ use ffi; /// [`CrcReader`]: struct.CrcReader.html #[derive(Debug)] pub struct Crc { - crc: libc::c_ulong, + crc: u32, amt: u32, + hardware: Hardware, } /// A wrapper around a [`Read`] that calculates the CRC. 
@@ -27,10 +30,10 @@ pub struct CrcReader { impl Crc { /// Create a new CRC. pub fn new() -> Crc { - Crc { crc: 0, amt: 0 } + Crc { crc: 0, amt: 0, hardware: Hardware::detect() } } - /// bla + /// Returns the current crc32 checksum. pub fn sum(&self) -> u32 { self.crc as u32 } @@ -44,7 +47,15 @@ impl Crc { /// Update the CRC with the bytes in `data`. pub fn update(&mut self, data: &[u8]) { self.amt = self.amt.wrapping_add(data.len() as u32); - self.crc = unsafe { ffi::mz_crc32(self.crc, data.as_ptr(), data.len() as libc::size_t) }; + self.crc = self.hardware.calculate(self.crc, data, |crc, data| { + unsafe { + ffi::mz_crc32( + crc as libc::c_ulong, + data.as_ptr(), + data.len() as libc::size_t, + ) as u32 + } + }); } /// Reset the CRC. @@ -57,10 +68,10 @@ impl Crc { pub fn combine(&mut self, additional_crc: &Crc) { self.crc = unsafe { ffi::mz_crc32_combine( - self.crc as ::libc::c_ulong, - additional_crc.crc as ::libc::c_ulong, - additional_crc.amt as ::libc::off_t, - ) + self.crc as libc::c_ulong, + additional_crc.crc as libc::c_ulong, + additional_crc.amt as libc::off_t, + ) as u32 }; self.amt += additional_crc.amt; } diff --git a/src/lib.rs b/src/lib.rs index 550e68b60..163575d0f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -79,6 +79,7 @@ #![allow(trivial_numeric_casts)] #![cfg_attr(test, deny(warnings))] +extern crate flate2_crc; #[cfg(feature = "tokio")] extern crate futures; #[cfg(not(all(target_arch = "wasm32", not(target_os = "emscripten"))))]