From 9b4459213c7ce5432d53be92cd2cf56d1221740e Mon Sep 17 00:00:00 2001
From: Alex Crichton <alex@alexcrichton.com>
Date: Sat, 17 Nov 2018 22:28:30 -0800
Subject: [PATCH] Implement a SIMD fast path for CRC checksums

Recently I was profiling Cargo's extraction of tarballs and was quite
surprised to learn that 15% of the execution time of tarball extraction
was entirely crc32 checksum calculations in miniz. This was quite a
surprise to me and led me down a long rabbit hole of figuring out how to
speed this up!

It turns out Intel's written a paper, "Fast CRC Computation for Generic
Polynomials Using PCLMULQDQ Instruction", which describes how to
compute a CRC-32 value using hardware instructions. Note that these
are not the hardware CRC instructions, which I think are a different
algorithm.

This commit implements this paper in Rust, looking to a few other
external implementations for guidance as well. Overall the results are
quite promising, and I'm pretty confident in the correctness of this as
well. Current results look like:

* This SIMD implementation runs at about 25GB/s
* The miniz implementation runs at about 450MB/s
* The zlib implementation, on OSX, runs at 25GB/s (seems to implement the
  same algorithm)
* The bundled zlib implementation (and also the one I found on Linux)
  runs at 1.4GB/s

So this should be ~50 times faster for Cargo (which uses miniz), about
20 times faster for anyone using system zlib on Linux or the bundled
zlib, and on par with OSX's zlib performance.
---
 .travis.yml               |   9 +++
 Cargo.toml                |   1 +
 appveyor.yml              |  10 ++-
 flate2-crc/Cargo.toml     |  21 +++++
 flate2-crc/benches/run.rs |  69 ++++++++++++++++
 flate2-crc/build.rs       |  36 +++++++++
 flate2-crc/src/lib.rs     | 103 ++++++++++++++++++++++++
 flate2-crc/src/other.rs   |  12 +++
 flate2-crc/src/x86.rs     | 160 ++++++++++++++++++++++++++++++++++++++
 src/crc.rs                |  27 +++++--
 src/lib.rs                |   1 +
 11 files changed, 439 insertions(+), 10 deletions(-)
 create mode 100644 flate2-crc/Cargo.toml
 create mode 100644 flate2-crc/benches/run.rs
 create mode 100644 flate2-crc/build.rs
 create mode 100644 flate2-crc/src/lib.rs
 create mode 100644 flate2-crc/src/other.rs
 create mode 100644 flate2-crc/src/x86.rs

diff --git a/.travis.yml b/.travis.yml
index 14da3357b..2ed33e8d6 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -10,10 +10,12 @@ matrix:
 
     - rust: stable
       script: cargo run --manifest-path systest/Cargo.toml
+      name: "systest"
 
     - rust: nightly
       install: rustup target add wasm32-unknown-unknown
       script: cargo build --target wasm32-unknown-unknown
+      name: "wasm"
 
     - rust: stable
       env: RUST_BACKEND=1
@@ -28,6 +30,7 @@ matrix:
         - cargo doc --no-deps --all-features
       after_success:
         - travis-cargo --only nightly doc-upload
+      name: "docs"
 
   allow_failures:
     - env: RUST_BACKEND=1
@@ -40,10 +43,16 @@ script:
   - cargo test --features tokio
   - cargo test --features 'tokio zlib'
   - cargo test --features zlib --no-default-features
+  - cargo test --manifest-path flate2-crc/Cargo.toml
+  - cargo test --release --manifest-path flate2-crc/Cargo.toml
   - cargo clean && cargo build
   - cargo doc --no-deps
   - cargo doc --no-deps --manifest-path=miniz-sys/Cargo.toml
 
+branches:
+  only:
+    - master
+
 env:
   global:
     secure: "PHVT7IaeP5nQQVwGHKwqCYBDp0QyetSlER7se2j2Xgfx+lw3Bu6VWH6VF04B636Gb0tHPN/sUCXSgGRcvDuy6XFOev4LfynoYxNKgHJYg2E34EP2QLwsFfnvE4iujaG3GJk3o935Y7OYGv2OP1HeG4Mv6JhQK0GLnNDBZQ65kWI="
diff --git a/Cargo.toml b/Cargo.toml
index eb97da4ef..34bf99954 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,6 +26,7 @@ libz-sys = { version = "1.0", optional = true }
 tokio-io = { version = "0.1", optional = true }
 futures = { version = "0.1", optional = true }
 miniz_oxide_c_api = { version = "0.2", optional = true, features = ["no_c_export"]}
+flate2-crc = { version = '0.1', path = 'flate2-crc' }
 
 [target.'cfg(all(target_arch = "wasm32", not(target_os = "emscripten")))'.dependencies]
 miniz_oxide_c_api = { version = "0.2", features = ["no_c_export"] }
diff --git a/appveyor.yml b/appveyor.yml
index 0a140f7bc..f66ffed29 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -20,5 +20,11 @@ install:
 build: false
 
 test_script:
-  - cargo test --verbose --target %TARGET%
-  - cargo test --verbose --target %TARGET% --features tokio
+  - cargo test --target %TARGET%
+  - cargo test --target %TARGET% --features tokio
+  - cargo test --target %TARGET% --manifest-path flate2-crc/Cargo.toml
+  - cargo test --target %TARGET% --manifest-path flate2-crc/Cargo.toml --release
+
+branches:
+  only:
+    - master
diff --git a/flate2-crc/Cargo.toml b/flate2-crc/Cargo.toml
new file mode 100644
index 000000000..0c0ee797d
--- /dev/null
+++ b/flate2-crc/Cargo.toml
@@ -0,0 +1,21 @@
+[package]
+name = "flate2-crc"
+version = "0.1.0"
+authors = ["Alex Crichton <alex@alexcrichton.com>"]
+license = "MIT/Apache-2.0"
+repository = "https://github.com/alexcrichton/flate2-rs/tree/flate2-crc"
+homepage = "https://github.com/alexcrichton/flate2-rs"
+documentation = "https://docs.rs/flate2-crc"
+description = """
+SIMD acceleration for CRC-32 checksums used in the gzip format
+"""
+
+[dependencies]
+cfg-if = "0.1.6"
+
+[dev-dependencies]
+miniz-sys = { path = '../miniz-sys' }
+rand = "0.6"
+libz-sys = "1.0"
+rayon = "1.0.3"
+quickcheck = "0.7"
diff --git a/flate2-crc/benches/run.rs b/flate2-crc/benches/run.rs
new file mode 100644
index 000000000..ae49130e5
--- /dev/null
+++ b/flate2-crc/benches/run.rs
@@ -0,0 +1,69 @@
+#![feature(test)]
+
+extern crate flate2_crc;
+extern crate rand;
+extern crate test;
+extern crate miniz_sys;
+extern crate libz_sys;
+
+use rand::{thread_rng, RngCore};
+
+// Benchmark target: `Hardware` dispatch with miniz as the scalar fallback.
+fn flate2_crc(data: &[u8]) -> u32 {
+    flate2_crc::Hardware::detect().calculate(0, data, |crc, data| unsafe {
+        // `as _` rather than `as u64`: the parameter is `c_ulong`, 32-bit on Windows.
+        miniz_sys::mz_crc32(crc as _, data.as_ptr(), data.len()) as u32
+    })
+}
+
+fn miniz(data: &[u8]) -> u32 {
+    unsafe {
+        miniz_sys::mz_crc32(0, data.as_ptr(), data.len()) as u32
+    }
+}
+
+fn zlib(data: &[u8]) -> u32 {
+    unsafe {
+        libz_sys::crc32(0, data.as_ptr(), data.len() as u32) as u32
+    }
+}
+
+macro_rules! benches {
+    ($($f:ident => ($small:ident, $medium:ident, $large:ident),)*) => ($(
+        #[bench]
+        fn $small(b: &mut test::Bencher) {
+            let mut rng = thread_rng();
+            let mut buf = vec![0u8; 8];
+            rng.fill_bytes(&mut buf);
+
+            b.bytes = 8;
+            b.iter(|| $f(&buf));
+        }
+
+        #[bench]
+        fn $medium(b: &mut test::Bencher) {
+            let mut rng = thread_rng();
+            let mut buf = vec![0u8; 65_000];
+            rng.fill_bytes(&mut buf);
+
+            b.bytes = 65_000;
+            b.iter(|| $f(&buf));
+        }
+
+        #[bench]
+        fn $large(b: &mut test::Bencher) {
+            let mut rng = thread_rng();
+            let mut buf = vec![0u8; 1_000_000];
+            rng.fill_bytes(&mut buf);
+
+            b.bytes = 1_000_000;
+            b.iter(|| $f(&buf));
+        }
+    )*)
+}
+
+benches! {
+    flate2_crc => (flate2_crc_8, flate2_crc_65000, flate2_crc_1000000),
+    miniz => (miniz_8, miniz_65000, miniz_1000000),
+    zlib => (zlib_8, zlib_65000, zlib_1000000),
+}
diff --git a/flate2-crc/build.rs b/flate2-crc/build.rs
new file mode 100644
index 000000000..d03562018
--- /dev/null
+++ b/flate2-crc/build.rs
@@ -0,0 +1,36 @@
+use std::env;
+use std::process::Command;
+use std::str;
+
+fn main() {
+    println!("cargo:rerun-if-changed=build.rs");
+
+    let minor = match rustc_minor_version() {
+        Some(n) => n,
+        None => return,
+    };
+
+    if minor >= 27 {
+        println!("cargo:rustc-cfg=simd");
+    }
+}
+
+fn rustc_minor_version() -> Option<u32> {
+    macro_rules! otry {
+        ($e:expr) => {
+            match $e {
+                Some(e) => e,
+                None => return None,
+            }
+        };
+    }
+    let rustc = otry!(env::var_os("RUSTC"));
+    let output = otry!(Command::new(rustc).arg("--version").output().ok());
+    let version = otry!(str::from_utf8(&output.stdout).ok());
+    let mut pieces = version.split('.');
+    if pieces.next() != Some("rustc 1") {
+        return None;
+    }
+    otry!(pieces.next()).parse().ok()
+}
+
diff --git a/flate2-crc/src/lib.rs b/flate2-crc/src/lib.rs
new file mode 100644
index 000000000..f221519c3
--- /dev/null
+++ b/flate2-crc/src/lib.rs
@@ -0,0 +1,103 @@
+// Note that this isn't really intended to be a user-facing crate, that's
+// `flate2::Crc`
+
+#[macro_use]
+extern crate cfg_if;
+
+#[cfg(test)]
+#[macro_use]
+extern crate quickcheck;
+
+cfg_if! {
+    if #[cfg(not(simd))] {
+        mod other;
+        use self::other as imp;
+    } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
+        mod x86;
+        use self::x86 as imp;
+    } else {
+        mod other;
+        use self::other as imp;
+    }
+}
+
+#[derive(Debug)]
+pub struct Hardware(bool);
+
+impl Hardware {
+    #[inline]
+    pub fn detect() -> Hardware {
+        Hardware(imp::detect())
+    }
+
+    #[inline]
+    pub fn calculate(
+        &self,
+        crc: u32,
+        data: &[u8],
+        fallback: fn(u32, &[u8]) -> u32,
+    ) -> u32 {
+        if self.0 {
+            unsafe { imp::calculate(crc, data, fallback) }
+        } else {
+            fallback(crc, data)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    extern crate miniz_sys;
+    extern crate rand;
+    extern crate rayon;
+
+    use self::rand::Rng;
+    use self::rayon::prelude::*;
+    use super::Hardware;
+
+    fn fallback(a: u32, b: &[u8]) -> u32 {
+        unsafe {
+            miniz_sys::mz_crc32(a as _, b.as_ptr(), b.len()) as u32
+        }
+    }
+
+    fn random_chunks(iters: usize, lo: usize, hi: usize) {
+        let hardware = Hardware::detect();
+
+        (0..iters)
+            .into_par_iter()
+            .for_each_with(Vec::new(), |data, _| {
+                let mut rng = rand::thread_rng();
+                let init = rng.gen::<u32>();
+                let len = rng.gen_range(lo, hi);
+                data.resize(len, 0u8);
+                rng.fill(&mut data[..]);
+
+                assert_eq!(
+                    fallback(init, &data),
+                    hardware.calculate(init, &data, fallback),
+                );
+            });
+    }
+
+    #[test]
+    fn random_small() {
+        random_chunks(1000, 0, 256);
+    }
+
+    #[test]
+    fn random_med() {
+        random_chunks(1000, 256, 16 * 1024);
+    }
+
+    #[test]
+    fn random_large() {
+        random_chunks(1000, 0, 1024 * 1024);
+    }
+
+    quickcheck! {
+        fn prop(crc: u32, xs: Vec<u8>) -> bool {
+            fallback(crc, &xs) == Hardware::detect().calculate(crc, &xs, fallback)
+        }
+    }
+}
diff --git a/flate2-crc/src/other.rs b/flate2-crc/src/other.rs
new file mode 100644
index 000000000..5e855aa20
--- /dev/null
+++ b/flate2-crc/src/other.rs
@@ -0,0 +1,12 @@
+#[inline]
+pub fn detect() -> bool {
+    false
+}
+
+pub unsafe fn calculate(
+    _crc: u32,
+    _data: &[u8],
+    _fallback: fn(u32, &[u8]) -> u32,
+) -> u32 {
+    panic!()
+}
diff --git a/flate2-crc/src/x86.rs b/flate2-crc/src/x86.rs
new file mode 100644
index 000000000..bf6595720
--- /dev/null
+++ b/flate2-crc/src/x86.rs
@@ -0,0 +1,160 @@
+//! SIMD-based implementation of crc-32 checksums for x86 hardware.
+//!
+//! This module is based on Intel's paper, "Fast CRC Computation for Generic
+//! Polynomials Using PCLMULQDQ Instruction". The code is quite analogous to the
+//! paper itself and differs substantially in only one area. More information
+//! in the comments below!
+
+#![allow(non_upper_case_globals)]
+
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+
+const K1: i64 = 0x154442bd4;
+const K2: i64 = 0x1c6e41596;
+const K3: i64 = 0x1751997d0;
+const K4: i64 = 0x0ccaa009e;
+const K5: i64 = 0x163cd6124;
+const K6: i64 = 0x1db710640;
+
+const P_x: i64 = 0x1DB710641;
+const U_prime: i64 = 0x1F7011641;
+
+pub fn detect() -> bool {
+    is_x86_feature_detected!("pclmulqdq") &&
+        is_x86_feature_detected!("sse2") &&
+        is_x86_feature_detected!("sse4.1")
+}
+
+/// Debugging aid that returns `a` unchanged. Flip the `if false` guard to
+/// hex-dump the 16 bytes of `a` to stdout, labelled with `s`.
+unsafe fn debug(s: &str, a: __m128i) -> __m128i {
+    if false {
+        union A { a: __m128i, b: [u8; 16] }
+        let bytes = A { a }.b;
+        print!(" {:20} | ", s);
+        bytes.iter().for_each(|byte| print!("{:02x} ", byte));
+        println!();
+    }
+    a
+}
+
+#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
+pub unsafe fn calculate(
+    crc: u32,
+    mut data: &[u8],
+    fallback: fn(u32, &[u8]) -> u32,
+) -> u32 {
+    // In theory we can accelerate smaller chunks too, but for now just rely on
+    // the fallback implementation as it's too much hassle and doesn't seem too
+    // beneficial.
+    if data.len() < 128 {
+        return fallback(crc, data)
+    }
+
+    // Step 1: fold by 4 loop (four 16-byte lanes, 64 bytes consumed per pass)
+    let mut x3 = get(&mut data);
+    let mut x2 = get(&mut data);
+    let mut x1 = get(&mut data);
+    let mut x0 = get(&mut data);
+
+    // fold in our initial value, part of the incremental crc checksum
+    x3 = _mm_xor_si128(x3, _mm_cvtsi32_si128(!crc as i32));
+
+    let k1k2 = _mm_set_epi64x(K2, K1);
+    while data.len() >= 64 {
+        x3 = reduce128(x3, get(&mut data), k1k2);
+        x2 = reduce128(x2, get(&mut data), k1k2);
+        x1 = reduce128(x1, get(&mut data), k1k2);
+        x0 = reduce128(x0, get(&mut data), k1k2);
+    }
+
+    let k3k4 = _mm_set_epi64x(K4, K3);
+    let mut x = reduce128(x3, x2, k3k4);
+    x = reduce128(x, x1, k3k4);
+    x = reduce128(x, x0, k3k4);
+
+    // Step 2: fold by 1 loop (one 16-byte block per pass)
+    while data.len() >= 16 {
+        x = reduce128(x, get(&mut data), k3k4);
+    }
+
+    debug("128 > 64 init", x);
+
+    // Perform step 3, reduction from 128 bits to 64 bits. This is
+    // significantly different from the paper and basically doesn't follow it
+    // at all. It's not really clear why, but implementations of this algorithm
+    // in Chrome/Linux diverge in the same way. It is beyond me why this is
+    // different than the paper, maybe the paper has like errata or something?
+    // Unclear.
+    //
+    // It's also not clear to me what's actually happening here and/or why, but
+    // algebraically what's happening is:
+    //
+    // x = (x[0:63] • K4) ^ x[64:127]           // 96 bit result
+    // x = ((x[0:31] as u64) • K5) ^ x[32:95]   // 64 bit result
+    //
+    // It's... not clear to me what's going on here. The paper itself is pretty
+    // vague on this part but definitely uses different constants at least.
+    // It's not clear to me, reading the paper, where the xor operations are
+    // happening or why things are shifting around. This implementation...
+    // appears to work though!
+    drop(K6); // NOTE(review): K6 is otherwise unused; presumably this only silences dead_code
+    let x = _mm_xor_si128(
+        _mm_clmulepi64_si128(x, k3k4, 0x10),
+        _mm_srli_si128(x, 8),
+    );
+    let x = _mm_xor_si128(
+        _mm_clmulepi64_si128(
+            _mm_and_si128(x, _mm_set_epi32(0, 0, 0, !0)),
+            _mm_set_epi64x(0, K5),
+            0x00,
+        ),
+        _mm_srli_si128(x, 4),
+    );
+    debug("128 > 64 xx", x);
+
+    // Perform a Barrett reduction from our now 64 bits to 32 bits. The
+    // algorithm for this is described at the end of the paper, and note that
+    // this also implements the "bit reflected input" variant.
+    let pu = _mm_set_epi64x(U_prime, P_x);
+
+    // T1(x) = ⌊(R(x) % x^32)⌋ • μ
+    let t1 = _mm_clmulepi64_si128(
+        _mm_and_si128(x, _mm_set_epi32(0, 0, 0, !0)),
+        pu,
+        0x10,
+    );
+    // T2(x) = ⌊(T1(x) % x^32)⌋ • P(x)
+    let t2 = _mm_clmulepi64_si128(
+        _mm_and_si128(t1, _mm_set_epi32(0, 0, 0, !0)),
+        pu,
+        0x00,
+    );
+    // We're doing the bit-reflected variant, so get the upper 32-bits of the
+    // 64-bit result instead of the lower 32-bits.
+    //
+    // C(x) = R(x) ^ T2(x) / x^32
+    let c = _mm_extract_epi32(_mm_xor_si128(x, t2), 1) as u32;
+
+    if data.len() > 0 { // tail shorter than 16 bytes left by the fold-by-1 loop
+        fallback(!c, data)
+    } else {
+        !c
+    }
+}
+
+unsafe fn reduce128(a: __m128i, b: __m128i, keys: __m128i) -> __m128i {
+    let t1 = _mm_clmulepi64_si128(a, keys, 0x00);
+    let t2 = _mm_clmulepi64_si128(a, keys, 0x11);
+    _mm_xor_si128(_mm_xor_si128(b, t1), t2)
+}
+
+/// Pops the first 16 bytes of `a` as an unaligned vector; panics if `a` is shorter.
+unsafe fn get(a: &mut &[u8]) -> __m128i {
+    // Split before loading so the bounds check runs ahead of the raw read.
+    let (head, rest) = a.split_at(16);
+    *a = rest;
+    _mm_loadu_si128(head.as_ptr() as *const __m128i)
+}
diff --git a/src/crc.rs b/src/crc.rs
index 0d621a921..186d050fe 100644
--- a/src/crc.rs
+++ b/src/crc.rs
@@ -2,6 +2,8 @@
 
 use std::io::prelude::*;
 use std::io;
+
+use flate2_crc::Hardware;
 use libc;
 
 use ffi;
@@ -11,8 +13,9 @@ use ffi;
 /// [`CrcReader`]: struct.CrcReader.html
 #[derive(Debug)]
 pub struct Crc {
-    crc: libc::c_ulong,
+    crc: u32,
     amt: u32,
+    hardware: Hardware,
 }
 
 /// A wrapper around a [`Read`] that calculates the CRC.
@@ -27,10 +30,10 @@ pub struct CrcReader<R> {
 impl Crc {
     /// Create a new CRC.
     pub fn new() -> Crc {
-        Crc { crc: 0, amt: 0 }
+        Crc { crc: 0, amt: 0, hardware: Hardware::detect() }
     }
 
-    /// bla
+    /// Returns the current crc32 checksum.
     pub fn sum(&self) -> u32 {
         self.crc as u32
     }
@@ -44,7 +47,15 @@ impl Crc {
     /// Update the CRC with the bytes in `data`.
     pub fn update(&mut self, data: &[u8]) {
         self.amt = self.amt.wrapping_add(data.len() as u32);
-        self.crc = unsafe { ffi::mz_crc32(self.crc, data.as_ptr(), data.len() as libc::size_t) };
+        self.crc = self.hardware.calculate(self.crc, data, |crc, data| {
+            unsafe {
+                ffi::mz_crc32(
+                    crc as libc::c_ulong,
+                    data.as_ptr(),
+                    data.len() as libc::size_t,
+                ) as u32
+            }
+        });
     }
 
     /// Reset the CRC.
@@ -57,10 +68,10 @@ impl Crc {
     pub fn combine(&mut self, additional_crc: &Crc) {
         self.crc = unsafe {
             ffi::mz_crc32_combine(
-                self.crc as ::libc::c_ulong,
-                additional_crc.crc as ::libc::c_ulong,
-                additional_crc.amt as ::libc::off_t,
-            )
+                self.crc as libc::c_ulong,
+                additional_crc.crc as libc::c_ulong,
+                additional_crc.amt as libc::off_t,
+            ) as u32
         };
         self.amt += additional_crc.amt;
     }
diff --git a/src/lib.rs b/src/lib.rs
index 550e68b60..163575d0f 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -79,6 +79,7 @@
 #![allow(trivial_numeric_casts)]
 #![cfg_attr(test, deny(warnings))]
 
+extern crate flate2_crc;
 #[cfg(feature = "tokio")]
 extern crate futures;
 #[cfg(not(all(target_arch = "wasm32", not(target_os = "emscripten"))))]