From 033470f26ac356a032a6d5c9ce26e03048658bbe Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
Date: Tue, 21 Nov 2023 15:02:36 -0500
Subject: [PATCH 1/6] feat: add VAES support

---
 .gitignore        |   1 +
 Cargo.toml        |   5 ++-
 src/aes_hash.rs   |  58 +++++++++++++++++++++++++++
 src/convert.rs    |  14 +++++++
 src/lib.rs        |   1 +
 src/operations.rs | 100 +++++++++++++++++++++++++++++++++++++++++++++-
 tests/bench.rs    |  10 +++--
 7 files changed, 183 insertions(+), 6 deletions(-)

diff --git a/.gitignore b/.gitignore
index fc89f1b..490a548 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .idea/
+.vscode/
 Cargo.lock
 target
diff --git a/Cargo.toml b/Cargo.toml
index d37c2de..e2989fc 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -47,6 +47,9 @@ atomic-polyfill = [ "dep:atomic-polyfill", "once_cell/atomic-polyfill"]
 # Nightly-only support for AES intrinsics on 32-bit ARM
 nightly-arm-aes = []
 
+# Nightly-only support for VAES intrinsics with 256 SIMD registers
+vaes = []
+
 [[bench]]
 name = "ahash"
 path = "tests/bench.rs"
@@ -84,7 +87,7 @@ serde = { version = "1.0.117", optional = true }
 cfg-if = "1.0"
 atomic-polyfill = { version="1.0.1", optional=true}
 getrandom = { version = "0.2.7", optional = true }
-zerocopy = { version = "0.7.20", default-features = false, features = ["simd"] }
+zerocopy = { version = "0.7.20", default-features = false, features = ["simd", "derive"] }
 
 [target.'cfg(not(all(target_arch = "arm", target_os = "none")))'.dependencies]
 once_cell = { version = "1.18.0", default-features = false, features = ["alloc"] }
diff --git a/src/aes_hash.rs b/src/aes_hash.rs
index 0b9a1d4..42c27b0 100644
--- a/src/aes_hash.rs
+++ b/src/aes_hash.rs
@@ -22,6 +22,61 @@ pub struct AHasher {
     key: u128,
 }
 
+#[cfg(any(
+    all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)),
+    all(target_arch = "aarch64", target_feature = "aes", not(miri)),
+    all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)),
+))]
+fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) {
+    use zerocopy::transmute;
+    let tail = data.read_last_u128x8();
+    let current = [
+        aesenc(hasher.key, tail[0]),
+        aesdec(hasher.key, tail[1]),
+        aesenc(hasher.key, tail[2]),
+        aesdec(hasher.key, tail[3]),
+        aesenc(hasher.key, tail[4]),
+        aesdec(hasher.key, tail[5]),
+        aesenc(hasher.key, tail[6]),
+        aesdec(hasher.key, tail[7]),
+    ];
+    let tail : [Vector256;4] = transmute!(tail);
+    let mut current : [Vector256;4] = transmute!(current);
+    let mut sum : [Vector256; 2] = [
+        transmute!([hasher.key, !hasher.key]),
+        transmute!([hasher.key, !hasher.key]),
+    ];
+    sum[0] = add_by_64s_vec256(sum[0], tail[0]);
+    sum[0] = add_by_64s_vec256(sum[0], tail[1]);
+    sum[1] = shuffle_and_add_vec256(sum[1], tail[2]);
+    sum[1] = shuffle_and_add_vec256(sum[1], tail[3]);
+    while data.len() > 128 {
+        let (blocks, rest) = data.read_u128x8();
+        let blocks : [Vector256; 4] = transmute!(blocks);
+        current[0] = aesdec_vec256(current[0], blocks[0]);
+        current[1] = aesdec_vec256(current[1], blocks[1]);
+        current[2] = aesdec_vec256(current[2], blocks[2]);
+        current[3] = aesdec_vec256(current[3], blocks[3]);
+        sum[0] = shuffle_and_add_vec256(sum[0], blocks[0]);
+        sum[0] = shuffle_and_add_vec256(sum[0], blocks[1]);
+        sum[1] = shuffle_and_add_vec256(sum[1], blocks[2]);
+        sum[1] = shuffle_and_add_vec256(sum[1], blocks[3]);
+        *data = rest;
+    }
+    let current : [[u128;2]; 4] = transmute!(current);
+    let sum : [[u128;2]; 2] = transmute!(sum);
+    hasher.hash_in_2(
+        aesdec(current[0][0], current[0][1]),
+        aesdec(current[1][0], current[1][1]),
+    );
+    hasher.hash_in(add_by_64s(sum[0][0].convert(), sum[0][1].convert()).convert());
+    hasher.hash_in_2(
+        aesdec(current[2][0], current[2][1]),
+        aesdec(current[3][0], current[3][1]),
+    );
+    hasher.hash_in(add_by_64s(sum[1][0].convert(), sum[1][1].convert()).convert());
+}
+
 impl AHasher {
     /// Creates a new hasher keyed to the provided keys.
     ///
@@ -160,6 +215,9 @@ impl Hasher for AHasher {
             self.hash_in(value.convert());
         } else {
             if data.len() > 32 {
+                if data.len() > 128 {
+                    return hash_batch_128b(&mut data, self);
+                } 
                 if data.len() > 64 {
                     let tail = data.read_last_u128x4();
                     let mut current: [u128; 4] = [self.key; 4];
diff --git a/src/convert.rs b/src/convert.rs
index 712eae1..7541824 100644
--- a/src/convert.rs
+++ b/src/convert.rs
@@ -19,6 +19,7 @@ macro_rules! convert {
     };
 }
 
+convert!([u128; 8], [u8; 128]);
 convert!([u128; 4], [u64; 8]);
 convert!([u128; 4], [u32; 16]);
 convert!([u128; 4], [u16; 32]);
@@ -79,12 +80,14 @@ pub(crate) trait ReadFromSlice {
     fn read_u128(&self) -> (u128, &[u8]);
     fn read_u128x2(&self) -> ([u128; 2], &[u8]);
     fn read_u128x4(&self) -> ([u128; 4], &[u8]);
+    fn read_u128x8(&self) -> ([u128; 8], &[u8]);
     fn read_last_u16(&self) -> u16;
     fn read_last_u32(&self) -> u32;
     fn read_last_u64(&self) -> u64;
     fn read_last_u128(&self) -> u128;
     fn read_last_u128x2(&self) -> [u128; 2];
     fn read_last_u128x4(&self) -> [u128; 4];
+    fn read_last_u128x8(&self) -> [u128; 8];
 }
 
 impl ReadFromSlice for [u8] {
@@ -124,6 +127,12 @@ impl ReadFromSlice for [u8] {
         (as_array!(value, 64).convert(), rest)
     }
 
+    #[inline(always)]
+    fn read_u128x8(&self) -> ([u128; 8], &[u8]) {
+        let (value, rest) = self.split_at(128);
+        (as_array!(value, 128).convert(), rest)
+    }
+
     #[inline(always)]
     fn read_last_u16(&self) -> u16 {
         let (_, value) = self.split_at(self.len() - 2);
@@ -159,4 +168,9 @@ impl ReadFromSlice for [u8] {
         let (_, value) = self.split_at(self.len() - 64);
         as_array!(value, 64).convert()
     }
+
+    fn read_last_u128x8(&self) -> [u128; 8] {
+        let (_, value) = self.split_at(self.len() - 128);
+        as_array!(value, 128).convert()
+    }
 }
diff --git a/src/lib.rs b/src/lib.rs
index 2086513..5f7551e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -99,6 +99,7 @@ Note the import of [HashMapExt]. This is needed for the constructor.
 #![cfg_attr(all(not(test), not(feature = "std")), no_std)]
 #![cfg_attr(feature = "specialize", feature(min_specialization))]
 #![cfg_attr(feature = "nightly-arm-aes", feature(stdarch_arm_neon_intrinsics))]
+#![cfg_attr(feature = "vaes", feature(stdsimd))]
 
 #[macro_use]
 mod convert;
diff --git a/src/operations.rs b/src/operations.rs
index a420587..969d60b 100644
--- a/src/operations.rs
+++ b/src/operations.rs
@@ -180,11 +180,109 @@ pub(crate) fn add_in_length(enc: &mut u128, len: u64) {
     }
 }
 
+
+#[cfg(any(
+    all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)),
+    all(target_arch = "aarch64", target_feature = "aes", not(miri)),
+    all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)),
+))]
+mod vaes {
+use super::*;
+#[repr(C, align(32))]
+#[derive(zerocopy::AsBytes, zerocopy::FromZeroes, zerocopy::FromBytes, Copy, Clone)]
+pub(crate) struct Vector256([u128;2]);
+
+pub(crate) fn aesdec_vec256(value: Vector256, xor: Vector256) -> Vector256 {
+    cfg_if::cfg_if!{
+        if #[cfg(all(
+                any(target_arch = "x86", target_arch = "x86_64"), 
+                target_feature = "vaes", 
+                feature = "vaes", 
+                not(miri)
+            ))] {
+            use core::arch::x86_64::*;
+            unsafe {
+                transmute!(_mm256_aesdec_epi128(transmute!(value), transmute!(xor)))
+            }
+        }
+        else {
+            Vector256(
+                [
+                    aesdec(value.0[0], xor.0[0]),
+                    aesdec(value.0[1], xor.0[1]),
+                ]
+            )
+        }
+    }
+}
+
+pub(crate) fn add_by_64s_vec256(a: Vector256, b: Vector256) -> Vector256 {
+    cfg_if::cfg_if!{
+        if #[cfg(all(
+            any(target_arch = "x86", target_arch = "x86_64"), 
+            target_feature = "vaes", 
+            feature = "vaes", 
+            not(miri)
+        ))] {
+            use core::arch::x86_64::*;
+            unsafe {
+                transmute!(_mm256_add_epi64(transmute!(a), transmute!(b)))
+            }
+        }
+        else {
+            Vector256(
+                [
+                    add_by_64s(a.0[0], b.0[0]),
+                    add_by_64s(a.0[1], b.0[1]),
+                ]
+            )
+        }
+    }
+}
+
+pub(crate) fn shuffle_vec256(value: Vector256) -> Vector256 {
+    cfg_if::cfg_if!{
+        if #[cfg(all(
+            any(target_arch = "x86", target_arch = "x86_64"), 
+            target_feature = "vaes", 
+            feature = "vaes", 
+            not(miri)
+        ))] {
+            unsafe {
+                use core::arch::x86_64::*;
+                let mask = transmute!([SHUFFLE_MASK, SHUFFLE_MASK]);
+                transmute!(_mm256_shuffle_epi8(transmute!(value.0), mask))
+            }
+        }
+        else {
+            Vector256(
+                [
+                    shuffle(value.0[0]),
+                    shuffle(value.0[1]),
+                ]
+            )
+        }
+    }
+}
+
+pub(crate) fn shuffle_and_add_vec256(base: Vector256, to_add: Vector256) -> Vector256 {
+    add_by_64s_vec256(shuffle_vec256(base), to_add)
+}
+}
+
+#[cfg(any(
+    all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)),
+    all(target_arch = "aarch64", target_feature = "aes", not(miri)),
+    all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)),
+))]
+pub(crate) use vaes::*;
+
+
 #[cfg(test)]
 mod test {
     use super::*;
     use crate::convert::Convert;
-
+    
     // This is code to search for the shuffle constant
     //
     //thread_local! { static MASK: Cell<u128> = Cell::new(0); }
diff --git a/tests/bench.rs b/tests/bench.rs
index e038ba4..245455f 100644
--- a/tests/bench.rs
+++ b/tests/bench.rs
@@ -56,10 +56,11 @@ fn seahash<H: Hash>(b: &H) -> u64 {
     hasher.finish()
 }
 
-const STRING_LENGTHS: [u32; 12] = [1, 3, 4, 7, 8, 15, 16, 24, 33, 68, 132, 1024];
+const STRING_LENGTHS: &'static [u32; 12] = &[1, 3, 4, 7, 8, 15, 16, 24, 33, 68, 132, 1024];
+const WIDER_STRINGS_LENGTHS: &'static [u32] = &[1, 64, 1024, 4096, 5261, 16384, 19997];
 
-fn gen_strings() -> Vec<String> {
-    STRING_LENGTHS
+fn gen_strings(lengths: &[u32]) -> Vec<String> {
+    lengths
         .iter()
         .map(|len| {
             let mut string = String::default();
@@ -83,7 +84,8 @@ macro_rules! bench_inputs {
         $group.bench_function("u32", |b| b.iter_batched(|| rng.gen::<u32>(), |v| $hash(&v), size));
         $group.bench_function("u64", |b| b.iter_batched(|| rng.gen::<u64>(), |v| $hash(&v), size));
         $group.bench_function("u128", |b| b.iter_batched(|| rng.gen::<u128>(), |v| $hash(&v), size));
-        $group.bench_with_input("strings", &gen_strings(), |b, s| b.iter(|| $hash(black_box(s))));
+        $group.bench_with_input("strings", &gen_strings(STRING_LENGTHS), |b, s| b.iter(|| $hash(black_box(s))));
+        $group.bench_with_input("wide-strings", &gen_strings(WIDER_STRINGS_LENGTHS), |b, s| b.iter(|| $hash(black_box(s))));
     };
 }
 

From cd512ae6a158a8182b65fd60c96a2b7dd6596bfa Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
Date: Tue, 21 Nov 2023 15:03:06 -0500
Subject: [PATCH 2/6] misc: format code

---
 src/aes_hash.rs   |  14 ++---
 src/operations.rs | 142 +++++++++++++++++++++++-----------------------
 tests/bench.rs    |   8 ++-
 3 files changed, 83 insertions(+), 81 deletions(-)

diff --git a/src/aes_hash.rs b/src/aes_hash.rs
index 42c27b0..44d1108 100644
--- a/src/aes_hash.rs
+++ b/src/aes_hash.rs
@@ -40,9 +40,9 @@ fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) {
         aesenc(hasher.key, tail[6]),
         aesdec(hasher.key, tail[7]),
     ];
-    let tail : [Vector256;4] = transmute!(tail);
-    let mut current : [Vector256;4] = transmute!(current);
-    let mut sum : [Vector256; 2] = [
+    let tail: [Vector256; 4] = transmute!(tail);
+    let mut current: [Vector256; 4] = transmute!(current);
+    let mut sum: [Vector256; 2] = [
         transmute!([hasher.key, !hasher.key]),
         transmute!([hasher.key, !hasher.key]),
     ];
@@ -52,7 +52,7 @@ fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) {
     sum[1] = shuffle_and_add_vec256(sum[1], tail[3]);
     while data.len() > 128 {
         let (blocks, rest) = data.read_u128x8();
-        let blocks : [Vector256; 4] = transmute!(blocks);
+        let blocks: [Vector256; 4] = transmute!(blocks);
         current[0] = aesdec_vec256(current[0], blocks[0]);
         current[1] = aesdec_vec256(current[1], blocks[1]);
         current[2] = aesdec_vec256(current[2], blocks[2]);
@@ -63,8 +63,8 @@ fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) {
         sum[1] = shuffle_and_add_vec256(sum[1], blocks[3]);
         *data = rest;
     }
-    let current : [[u128;2]; 4] = transmute!(current);
-    let sum : [[u128;2]; 2] = transmute!(sum);
+    let current: [[u128; 2]; 4] = transmute!(current);
+    let sum: [[u128; 2]; 2] = transmute!(sum);
     hasher.hash_in_2(
         aesdec(current[0][0], current[0][1]),
         aesdec(current[1][0], current[1][1]),
@@ -217,7 +217,7 @@ impl Hasher for AHasher {
             if data.len() > 32 {
                 if data.len() > 128 {
                     return hash_batch_128b(&mut data, self);
-                } 
+                }
                 if data.len() > 64 {
                     let tail = data.read_last_u128x4();
                     let mut current: [u128; 4] = [self.key; 4];
diff --git a/src/operations.rs b/src/operations.rs
index 969d60b..88a6f15 100644
--- a/src/operations.rs
+++ b/src/operations.rs
@@ -180,94 +180,93 @@ pub(crate) fn add_in_length(enc: &mut u128, len: u64) {
     }
 }
 
-
 #[cfg(any(
     all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)),
     all(target_arch = "aarch64", target_feature = "aes", not(miri)),
     all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)),
 ))]
 mod vaes {
-use super::*;
-#[repr(C, align(32))]
-#[derive(zerocopy::AsBytes, zerocopy::FromZeroes, zerocopy::FromBytes, Copy, Clone)]
-pub(crate) struct Vector256([u128;2]);
-
-pub(crate) fn aesdec_vec256(value: Vector256, xor: Vector256) -> Vector256 {
-    cfg_if::cfg_if!{
-        if #[cfg(all(
-                any(target_arch = "x86", target_arch = "x86_64"), 
-                target_feature = "vaes", 
-                feature = "vaes", 
-                not(miri)
-            ))] {
-            use core::arch::x86_64::*;
-            unsafe {
-                transmute!(_mm256_aesdec_epi128(transmute!(value), transmute!(xor)))
+    use super::*;
+    #[repr(C, align(32))]
+    #[derive(zerocopy::AsBytes, zerocopy::FromZeroes, zerocopy::FromBytes, Copy, Clone)]
+    pub(crate) struct Vector256([u128; 2]);
+
+    pub(crate) fn aesdec_vec256(value: Vector256, xor: Vector256) -> Vector256 {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                    any(target_arch = "x86", target_arch = "x86_64"),
+                    target_feature = "vaes",
+                    feature = "vaes",
+                    not(miri)
+                ))] {
+                use core::arch::x86_64::*;
+                unsafe {
+                    transmute!(_mm256_aesdec_epi128(transmute!(value), transmute!(xor)))
+                }
+            }
+            else {
+                Vector256(
+                    [
+                        aesdec(value.0[0], xor.0[0]),
+                        aesdec(value.0[1], xor.0[1]),
+                    ]
+                )
             }
-        }
-        else {
-            Vector256(
-                [
-                    aesdec(value.0[0], xor.0[0]),
-                    aesdec(value.0[1], xor.0[1]),
-                ]
-            )
         }
     }
-}
 
-pub(crate) fn add_by_64s_vec256(a: Vector256, b: Vector256) -> Vector256 {
-    cfg_if::cfg_if!{
-        if #[cfg(all(
-            any(target_arch = "x86", target_arch = "x86_64"), 
-            target_feature = "vaes", 
-            feature = "vaes", 
-            not(miri)
-        ))] {
-            use core::arch::x86_64::*;
-            unsafe {
-                transmute!(_mm256_add_epi64(transmute!(a), transmute!(b)))
+    pub(crate) fn add_by_64s_vec256(a: Vector256, b: Vector256) -> Vector256 {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                any(target_arch = "x86", target_arch = "x86_64"),
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                use core::arch::x86_64::*;
+                unsafe {
+                    transmute!(_mm256_add_epi64(transmute!(a), transmute!(b)))
+                }
+            }
+            else {
+                Vector256(
+                    [
+                        add_by_64s(a.0[0], b.0[0]),
+                        add_by_64s(a.0[1], b.0[1]),
+                    ]
+                )
             }
-        }
-        else {
-            Vector256(
-                [
-                    add_by_64s(a.0[0], b.0[0]),
-                    add_by_64s(a.0[1], b.0[1]),
-                ]
-            )
         }
     }
-}
 
-pub(crate) fn shuffle_vec256(value: Vector256) -> Vector256 {
-    cfg_if::cfg_if!{
-        if #[cfg(all(
-            any(target_arch = "x86", target_arch = "x86_64"), 
-            target_feature = "vaes", 
-            feature = "vaes", 
-            not(miri)
-        ))] {
-            unsafe {
-                use core::arch::x86_64::*;
-                let mask = transmute!([SHUFFLE_MASK, SHUFFLE_MASK]);
-                transmute!(_mm256_shuffle_epi8(transmute!(value.0), mask))
+    pub(crate) fn shuffle_vec256(value: Vector256) -> Vector256 {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                any(target_arch = "x86", target_arch = "x86_64"),
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                unsafe {
+                    use core::arch::x86_64::*;
+                    let mask = transmute!([SHUFFLE_MASK, SHUFFLE_MASK]);
+                    transmute!(_mm256_shuffle_epi8(transmute!(value.0), mask))
+                }
+            }
+            else {
+                Vector256(
+                    [
+                        shuffle(value.0[0]),
+                        shuffle(value.0[1]),
+                    ]
+                )
             }
-        }
-        else {
-            Vector256(
-                [
-                    shuffle(value.0[0]),
-                    shuffle(value.0[1]),
-                ]
-            )
         }
     }
-}
 
-pub(crate) fn shuffle_and_add_vec256(base: Vector256, to_add: Vector256) -> Vector256 {
-    add_by_64s_vec256(shuffle_vec256(base), to_add)
-}
+    pub(crate) fn shuffle_and_add_vec256(base: Vector256, to_add: Vector256) -> Vector256 {
+        add_by_64s_vec256(shuffle_vec256(base), to_add)
+    }
 }
 
 #[cfg(any(
@@ -277,12 +276,11 @@ pub(crate) fn shuffle_and_add_vec256(base: Vector256, to_add: Vector256) -> Vect
 ))]
 pub(crate) use vaes::*;
 
-
 #[cfg(test)]
 mod test {
     use super::*;
     use crate::convert::Convert;
-    
+
     // This is code to search for the shuffle constant
     //
     //thread_local! { static MASK: Cell<u128> = Cell::new(0); }
diff --git a/tests/bench.rs b/tests/bench.rs
index 245455f..24a08a6 100644
--- a/tests/bench.rs
+++ b/tests/bench.rs
@@ -84,8 +84,12 @@ macro_rules! bench_inputs {
         $group.bench_function("u32", |b| b.iter_batched(|| rng.gen::<u32>(), |v| $hash(&v), size));
         $group.bench_function("u64", |b| b.iter_batched(|| rng.gen::<u64>(), |v| $hash(&v), size));
         $group.bench_function("u128", |b| b.iter_batched(|| rng.gen::<u128>(), |v| $hash(&v), size));
-        $group.bench_with_input("strings", &gen_strings(STRING_LENGTHS), |b, s| b.iter(|| $hash(black_box(s))));
-        $group.bench_with_input("wide-strings", &gen_strings(WIDER_STRINGS_LENGTHS), |b, s| b.iter(|| $hash(black_box(s))));
+        $group.bench_with_input("strings", &gen_strings(STRING_LENGTHS), |b, s| {
+            b.iter(|| $hash(black_box(s)))
+        });
+        $group.bench_with_input("wide-strings", &gen_strings(WIDER_STRINGS_LENGTHS), |b, s| {
+            b.iter(|| $hash(black_box(s)))
+        });
     };
 }
 

From cc352e5a4e3187fa099c9f438c56cad224eaab80 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
Date: Tue, 21 Nov 2023 17:15:59 -0500
Subject: [PATCH 3/6] adjust performance and testings

---
 smhasher/ahash-cbindings/Cargo.toml |   5 +-
 smhasher/ahash-cbindings/install.sh |  10 +-
 smhasher/ahash-cbindings/src/lib.rs |   1 -
 src/aes_hash.rs                     |  54 ++++----
 src/operations.rs                   | 186 ++++++++++++++++++++++++----
 5 files changed, 201 insertions(+), 55 deletions(-)

diff --git a/smhasher/ahash-cbindings/Cargo.toml b/smhasher/ahash-cbindings/Cargo.toml
index 49513b3..5ac69e0 100644
--- a/smhasher/ahash-cbindings/Cargo.toml
+++ b/smhasher/ahash-cbindings/Cargo.toml
@@ -17,4 +17,7 @@ lto = 'fat'
 debug-assertions = false
 
 [dependencies]
-ahash = { path = "../../", default-features = false }
\ No newline at end of file
+ahash = { path = "../../", default-features = false }
+
+[features]
+vaes = ["ahash/vaes"]
diff --git a/smhasher/ahash-cbindings/install.sh b/smhasher/ahash-cbindings/install.sh
index 2effe93..7f9142f 100755
--- a/smhasher/ahash-cbindings/install.sh
+++ b/smhasher/ahash-cbindings/install.sh
@@ -1 +1,9 @@
-RUSTFLAGS="-C opt-level=3 -C target-cpu=native -C codegen-units=1" cargo build --release && sudo cp target/release/libahash_c.a /usr/local/lib/
+
+# check if args contains vaes
+if [[ $* == *vaes* ]]; then
+    export CARGO_OPTS="--features=vaes"
+else
+    export CARGO_OPTS=""
+fi
+
+RUSTFLAGS="-C opt-level=3 -C target-cpu=native -C codegen-units=1" cargo build ${CARGO_OPTS} --release && sudo cp target/release/libahash_c.a /usr/local/lib/
diff --git a/smhasher/ahash-cbindings/src/lib.rs b/smhasher/ahash-cbindings/src/lib.rs
index e828f12..9c8fb05 100644
--- a/smhasher/ahash-cbindings/src/lib.rs
+++ b/smhasher/ahash-cbindings/src/lib.rs
@@ -1,6 +1,5 @@
 use ahash::*;
 use core::slice;
-use std::hash::{BuildHasher};
 
 #[no_mangle]
 pub extern "C" fn ahash64(buf: *const (), len: usize, seed: u64) -> u64 {
diff --git a/src/aes_hash.rs b/src/aes_hash.rs
index 44d1108..62e82b5 100644
--- a/src/aes_hash.rs
+++ b/src/aes_hash.rs
@@ -27,52 +27,47 @@ pub struct AHasher {
     all(target_arch = "aarch64", target_feature = "aes", not(miri)),
     all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)),
 ))]
+#[inline(never)]
 fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) {
-    use zerocopy::transmute;
-    let tail = data.read_last_u128x8();
-    let current = [
-        aesenc(hasher.key, tail[0]),
-        aesdec(hasher.key, tail[1]),
-        aesenc(hasher.key, tail[2]),
-        aesdec(hasher.key, tail[3]),
-        aesenc(hasher.key, tail[4]),
-        aesdec(hasher.key, tail[5]),
-        aesenc(hasher.key, tail[6]),
-        aesdec(hasher.key, tail[7]),
-    ];
-    let tail: [Vector256; 4] = transmute!(tail);
-    let mut current: [Vector256; 4] = transmute!(current);
-    let mut sum: [Vector256; 2] = [
-        transmute!([hasher.key, !hasher.key]),
-        transmute!([hasher.key, !hasher.key]),
+    let tail = read_last4_vec256(data);
+    let mut current = [
+        aesenc_vec256(convert_u128_to_vec256(hasher.key), tail[0]),
+        aesdec_vec256(convert_u128_to_vec256(hasher.key), tail[1]),
+        aesenc_vec256(convert_u128_to_vec256(hasher.key), tail[2]),
+        aesdec_vec256(convert_u128_to_vec256(hasher.key), tail[3]),
     ];
+    let mut sum: [Vector256; 2] = [convert_u128_to_vec256(hasher.key), convert_u128_to_vec256(!hasher.key)];
     sum[0] = add_by_64s_vec256(sum[0], tail[0]);
-    sum[0] = add_by_64s_vec256(sum[0], tail[1]);
-    sum[1] = shuffle_and_add_vec256(sum[1], tail[2]);
+    sum[1] = add_by_64s_vec256(sum[1], tail[1]);
+    sum[0] = shuffle_and_add_vec256(sum[0], tail[2]);
     sum[1] = shuffle_and_add_vec256(sum[1], tail[3]);
     while data.len() > 128 {
-        let (blocks, rest) = data.read_u128x8();
-        let blocks: [Vector256; 4] = transmute!(blocks);
+        let (blocks, rest) = read4_vec256(data);
         current[0] = aesdec_vec256(current[0], blocks[0]);
         current[1] = aesdec_vec256(current[1], blocks[1]);
         current[2] = aesdec_vec256(current[2], blocks[2]);
         current[3] = aesdec_vec256(current[3], blocks[3]);
         sum[0] = shuffle_and_add_vec256(sum[0], blocks[0]);
-        sum[0] = shuffle_and_add_vec256(sum[0], blocks[1]);
-        sum[1] = shuffle_and_add_vec256(sum[1], blocks[2]);
+        sum[1] = shuffle_and_add_vec256(sum[1], blocks[1]);
+        sum[0] = shuffle_and_add_vec256(sum[0], blocks[2]);
         sum[1] = shuffle_and_add_vec256(sum[1], blocks[3]);
         *data = rest;
     }
-    let current: [[u128; 2]; 4] = transmute!(current);
-    let sum: [[u128; 2]; 2] = transmute!(sum);
+    let current = [
+        convert_vec256_to_u128(current[0]),
+        convert_vec256_to_u128(current[1]),
+        convert_vec256_to_u128(current[2]),
+        convert_vec256_to_u128(current[3]),
+    ];
+    let sum = [convert_vec256_to_u128(sum[0]), convert_vec256_to_u128(sum[1])];
     hasher.hash_in_2(
-        aesdec(current[0][0], current[0][1]),
-        aesdec(current[1][0], current[1][1]),
+        aesenc(current[0][0], current[0][1]),
+        aesenc(current[1][0], current[1][1]),
     );
     hasher.hash_in(add_by_64s(sum[0][0].convert(), sum[0][1].convert()).convert());
     hasher.hash_in_2(
-        aesdec(current[2][0], current[2][1]),
-        aesdec(current[3][0], current[3][1]),
+        aesenc(current[2][0], current[2][1]),
+        aesenc(current[3][0], current[3][1]),
     );
     hasher.hash_in(add_by_64s(sum[1][0].convert(), sum[1][1].convert()).convert());
 }
@@ -102,6 +97,7 @@ impl AHasher {
     ///
     /// println!("Hash is {:x}!", hasher.finish());
     /// ```
+    #[allow(unused)]
     #[inline]
     pub(crate) fn new_with_keys(key1: u128, key2: u128) -> Self {
         let pi: [u128; 2] = PI.convert();
diff --git a/src/operations.rs b/src/operations.rs
index 88a6f15..c4fc93e 100644
--- a/src/operations.rs
+++ b/src/operations.rs
@@ -187,10 +187,44 @@ pub(crate) fn add_in_length(enc: &mut u128, len: u64) {
 ))]
 mod vaes {
     use super::*;
-    #[repr(C, align(32))]
-    #[derive(zerocopy::AsBytes, zerocopy::FromZeroes, zerocopy::FromBytes, Copy, Clone)]
-    pub(crate) struct Vector256([u128; 2]);
+    cfg_if::cfg_if! {
+        if #[cfg(all(
+                any(target_arch = "x86", target_arch = "x86_64"),
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+            pub type Vector256 = core::arch::x86_64::__m256i;
+        }
+        else {
+            pub type Vector256 = [u128;2];
+        }
+    }
 
+    #[inline(always)]
+    pub(crate) fn aesenc_vec256(value: Vector256, xor: Vector256) -> Vector256 {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                    any(target_arch = "x86", target_arch = "x86_64"),
+                    target_feature = "vaes",
+                    feature = "vaes",
+                    not(miri)
+                ))] {
+                use core::arch::x86_64::*;
+                unsafe {
+                    _mm256_aesenc_epi128(value, xor)
+                }
+            }
+            else {
+                    [
+                        aesenc(value[0], xor[0]),
+                        aesenc(value[1], xor[1]),
+                    ]
+            }
+        }
+    }
+
+    #[inline(always)]
     pub(crate) fn aesdec_vec256(value: Vector256, xor: Vector256) -> Vector256 {
         cfg_if::cfg_if! {
             if #[cfg(all(
@@ -201,20 +235,19 @@ mod vaes {
                 ))] {
                 use core::arch::x86_64::*;
                 unsafe {
-                    transmute!(_mm256_aesdec_epi128(transmute!(value), transmute!(xor)))
+                    _mm256_aesdec_epi128(value, xor)
                 }
             }
             else {
-                Vector256(
                     [
-                        aesdec(value.0[0], xor.0[0]),
-                        aesdec(value.0[1], xor.0[1]),
+                        aesdec(value[0], xor[0]),
+                        aesdec(value[1], xor[1]),
                     ]
-                )
             }
         }
     }
 
+    #[inline(always)]
     pub(crate) fn add_by_64s_vec256(a: Vector256, b: Vector256) -> Vector256 {
         cfg_if::cfg_if! {
             if #[cfg(all(
@@ -224,21 +257,18 @@ mod vaes {
                 not(miri)
             ))] {
                 use core::arch::x86_64::*;
-                unsafe {
-                    transmute!(_mm256_add_epi64(transmute!(a), transmute!(b)))
-                }
+                unsafe { _mm256_add_epi64(a, b) }
             }
             else {
-                Vector256(
-                    [
-                        add_by_64s(a.0[0], b.0[0]),
-                        add_by_64s(a.0[1], b.0[1]),
-                    ]
-                )
+                [
+                    transmute!(add_by_64s(transmute!(a[0]), transmute!(b[0]))),
+                    transmute!(add_by_64s(transmute!(a[1]), transmute!(b[1]))),
+                ]
             }
         }
     }
 
+    #[inline(always)]
     pub(crate) fn shuffle_vec256(value: Vector256) -> Vector256 {
         cfg_if::cfg_if! {
             if #[cfg(all(
@@ -249,17 +279,21 @@ mod vaes {
             ))] {
                 unsafe {
                     use core::arch::x86_64::*;
-                    let mask = transmute!([SHUFFLE_MASK, SHUFFLE_MASK]);
-                    transmute!(_mm256_shuffle_epi8(transmute!(value.0), mask))
+                    let mask : __m256i = _mm256_set_epi64x(
+                        SHUFFLE_MASK as i64,
+                        (SHUFFLE_MASK >> 64) as i64,
+                        SHUFFLE_MASK as i64,
+                        (SHUFFLE_MASK >> 64) as i64,
+                    );
+                    _mm256_shuffle_epi8(value, mask)
                 }
             }
             else {
-                Vector256(
+
                     [
-                        shuffle(value.0[0]),
-                        shuffle(value.0[1]),
+                        shuffle(value[0]),
+                        shuffle(value[1]),
                     ]
-                )
             }
         }
     }
@@ -267,6 +301,112 @@ mod vaes {
     pub(crate) fn shuffle_and_add_vec256(base: Vector256, to_add: Vector256) -> Vector256 {
         add_by_64s_vec256(shuffle_vec256(base), to_add)
     }
+
+    // We specialize this routine because sometimes the compiler is not able to
+    // optimize it properly.
+    pub(crate) fn read4_vec256(data: &[u8]) -> ([Vector256; 4], &[u8]) {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                any(target_arch = "x86", target_arch = "x86_64"),
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                use core::arch::x86_64::*;
+                let (arr, rem) = data.split_at(128);
+                let arr = unsafe {
+                   [ _mm256_loadu_si256(arr.as_ptr().cast::<__m256i>()),
+                     _mm256_loadu_si256(arr.as_ptr().add(32).cast::<__m256i>()),
+                     _mm256_loadu_si256(arr.as_ptr().add(64).cast::<__m256i>()),
+                     _mm256_loadu_si256(arr.as_ptr().add(96).cast::<__m256i>()),
+                   ]
+                };
+                (arr, rem)
+            }
+            else {
+                let (arr, slice) = data.read_u128x8();
+                (transmute!(arr), slice)
+            }
+        }
+    }
+
+    // We specialize this routine because sometimes the compiler is not able to
+    // optimize it properly.
+    pub(crate) fn read_last4_vec256(data: &[u8]) -> [Vector256; 4] {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                any(target_arch = "x86", target_arch = "x86_64"),
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                use core::arch::x86_64::*;
+                let (_, arr) = data.split_at(data.len() - 128);
+                let arr = unsafe {
+                   [ _mm256_loadu_si256(arr.as_ptr().cast::<__m256i>()),
+                     _mm256_loadu_si256(arr.as_ptr().add(32).cast::<__m256i>()),
+                     _mm256_loadu_si256(arr.as_ptr().add(64).cast::<__m256i>()),
+                     _mm256_loadu_si256(arr.as_ptr().add(96).cast::<__m256i>()),
+                   ]
+                };
+                arr
+            }
+            else {
+                let arr = data.read_last_u128x8();
+                transmute!(arr)
+            }
+        }
+    }
+
+    // We specialize this routine because sometimes the compiler is not able to
+    // optimize it properly.
+    pub(crate) fn convert_u128_to_vec256(x: u128) -> Vector256 {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                any(target_arch = "x86", target_arch = "x86_64"),
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                use core::arch::x86_64::*;
+                unsafe {
+                    _mm256_set_epi64x(
+                        x as i64,
+                        (x >> 64) as i64,
+                        x as i64,
+                        (x >> 64) as i64,
+                    )
+                }
+            }
+            else {
+                transmute!([x, x])
+            }
+        }
+    }
+
+    // We specialize this routine because sometimes the compiler is not able to
+    // optimize it properly.
+    pub(crate) fn convert_vec256_to_u128(x: Vector256) -> [u128; 2] {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                any(target_arch = "x86", target_arch = "x86_64"),
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                use core::arch::x86_64::*;
+                unsafe {
+                    [
+                        transmute!(_mm256_extracti128_si256(x, 0)),
+                        transmute!(_mm256_extracti128_si256(x, 1)),
+                    ]
+                }
+            }
+            else {
+                x
+            }
+        }
+    }
 }
 
 #[cfg(any(

From f86bd295055cd5fcc8d0766544e86023cd9406cb Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
Date: Tue, 21 Nov 2023 17:17:48 -0500
Subject: [PATCH 4/6] remove never inline

---
 src/aes_hash.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/aes_hash.rs b/src/aes_hash.rs
index 62e82b5..ed7052d 100644
--- a/src/aes_hash.rs
+++ b/src/aes_hash.rs
@@ -27,7 +27,6 @@ pub struct AHasher {
     all(target_arch = "aarch64", target_feature = "aes", not(miri)),
     all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)),
 ))]
-#[inline(never)]
 fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) {
     let tail = read_last4_vec256(data);
     let mut current = [

From 93a8ebb79d531d8e00e28af12d450550fff81c36 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
Date: Tue, 21 Nov 2023 17:19:02 -0500
Subject: [PATCH 5/6] remove extra flags

---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index e2989fc..258e1a9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -87,7 +87,7 @@ serde = { version = "1.0.117", optional = true }
 cfg-if = "1.0"
 atomic-polyfill = { version="1.0.1", optional=true}
 getrandom = { version = "0.2.7", optional = true }
-zerocopy = { version = "0.7.20", default-features = false, features = ["simd", "derive"] }
+zerocopy = { version = "0.7.20", default-features = false, features = ["simd"] }
 
 [target.'cfg(not(all(target_arch = "arm", target_os = "none")))'.dependencies]
 once_cell = { version = "1.18.0", default-features = false, features = ["alloc"] }

From 6a4f02423e76bc376c8a1610a2731882600be346 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
Date: Tue, 21 Nov 2023 17:36:42 -0500
Subject: [PATCH 6/6] correct shuffle masks

---
 src/operations.rs | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/operations.rs b/src/operations.rs
index c4fc93e..911202c 100644
--- a/src/operations.rs
+++ b/src/operations.rs
@@ -279,12 +279,7 @@ mod vaes {
             ))] {
                 unsafe {
                     use core::arch::x86_64::*;
-                    let mask : __m256i = _mm256_set_epi64x(
-                        SHUFFLE_MASK as i64,
-                        (SHUFFLE_MASK >> 64) as i64,
-                        SHUFFLE_MASK as i64,
-                        (SHUFFLE_MASK >> 64) as i64,
-                    );
+                    let mask = convert_u128_to_vec256(SHUFFLE_MASK);
                     _mm256_shuffle_epi8(value, mask)
                 }
             }
@@ -371,10 +366,10 @@ mod vaes {
                 use core::arch::x86_64::*;
                 unsafe {
                     _mm256_set_epi64x(
-                        x as i64,
                         (x >> 64) as i64,
                         x as i64,
                         (x >> 64) as i64,
+                        x as i64,
                     )
                 }
             }