From 033470f26ac356a032a6d5c9ce26e03048658bbe Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 21 Nov 2023 15:02:36 -0500 Subject: [PATCH 1/6] feat: add VAES support --- .gitignore | 1 + Cargo.toml | 5 ++- src/aes_hash.rs | 58 +++++++++++++++++++++++++++ src/convert.rs | 14 +++++++ src/lib.rs | 1 + src/operations.rs | 100 +++++++++++++++++++++++++++++++++++++++++++++- tests/bench.rs | 10 +++-- 7 files changed, 183 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index fc89f1b..490a548 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .idea/ +.vscode/ Cargo.lock target diff --git a/Cargo.toml b/Cargo.toml index d37c2de..e2989fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,9 @@ atomic-polyfill = [ "dep:atomic-polyfill", "once_cell/atomic-polyfill"] # Nightly-only support for AES intrinsics on 32-bit ARM nightly-arm-aes = [] +# Nightly-only support for VAES intrinsics with 256 SIMD registers +vaes = [] + [[bench]] name = "ahash" path = "tests/bench.rs" @@ -84,7 +87,7 @@ serde = { version = "1.0.117", optional = true } cfg-if = "1.0" atomic-polyfill = { version="1.0.1", optional=true} getrandom = { version = "0.2.7", optional = true } -zerocopy = { version = "0.7.20", default-features = false, features = ["simd"] } +zerocopy = { version = "0.7.20", default-features = false, features = ["simd", "derive"] } [target.'cfg(not(all(target_arch = "arm", target_os = "none")))'.dependencies] once_cell = { version = "1.18.0", default-features = false, features = ["alloc"] } diff --git a/src/aes_hash.rs b/src/aes_hash.rs index 0b9a1d4..42c27b0 100644 --- a/src/aes_hash.rs +++ b/src/aes_hash.rs @@ -22,6 +22,61 @@ pub struct AHasher { key: u128, } +#[cfg(any( + all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)), + all(target_arch = "aarch64", target_feature = "aes", not(miri)), + all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)), +))] +fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) { + use zerocopy::transmute; + let tail = data.read_last_u128x8(); + let current = [ + aesenc(hasher.key, tail[0]), + aesdec(hasher.key, tail[1]), + aesenc(hasher.key, tail[2]), + aesdec(hasher.key, tail[3]), + aesenc(hasher.key, tail[4]), + aesdec(hasher.key, tail[5]), + aesenc(hasher.key, tail[6]), + aesdec(hasher.key, tail[7]), + ]; + let tail : [Vector256;4] = transmute!(tail); + let mut current : [Vector256;4] = transmute!(current); + let mut sum : [Vector256; 2] = [ + transmute!([hasher.key, !hasher.key]), + transmute!([hasher.key, !hasher.key]), + ]; + sum[0] = add_by_64s_vec256(sum[0], tail[0]); + sum[0] = add_by_64s_vec256(sum[0], tail[1]); + sum[1] = shuffle_and_add_vec256(sum[1], tail[2]); + sum[1] = shuffle_and_add_vec256(sum[1], tail[3]); + while data.len() > 128 { + let (blocks, rest) = data.read_u128x8(); + let blocks : [Vector256; 4] = transmute!(blocks); + current[0] = aesdec_vec256(current[0], blocks[0]); + current[1] = aesdec_vec256(current[1], blocks[1]); + current[2] = aesdec_vec256(current[2], blocks[2]); + current[3] = aesdec_vec256(current[3], blocks[3]); + sum[0] = shuffle_and_add_vec256(sum[0], blocks[0]); + sum[0] = shuffle_and_add_vec256(sum[0], blocks[1]); + sum[1] = shuffle_and_add_vec256(sum[1], blocks[2]); + sum[1] = shuffle_and_add_vec256(sum[1], blocks[3]); + *data = rest; + } + let current : [[u128;2]; 4] = transmute!(current); + let sum : [[u128;2]; 2] = transmute!(sum); + hasher.hash_in_2( + aesdec(current[0][0], current[0][1]), + aesdec(current[1][0], current[1][1]), + ); + hasher.hash_in(add_by_64s(sum[0][0].convert(), sum[0][1].convert()).convert()); + hasher.hash_in_2( + aesdec(current[2][0], current[2][1]), + aesdec(current[3][0], current[3][1]), + ); + hasher.hash_in(add_by_64s(sum[1][0].convert(), sum[1][1].convert()).convert()); +} + impl AHasher { /// Creates a new hasher keyed to the provided keys. /// @@ -160,6 +215,9 @@ impl Hasher for AHasher { self.hash_in(value.convert()); } else { if data.len() > 32 { + if data.len() > 128 { + return hash_batch_128b(&mut data, self); + } if data.len() > 64 { let tail = data.read_last_u128x4(); let mut current: [u128; 4] = [self.key; 4]; diff --git a/src/convert.rs b/src/convert.rs index 712eae1..7541824 100644 --- a/src/convert.rs +++ b/src/convert.rs @@ -19,6 +19,7 @@ macro_rules! convert { }; } +convert!([u128; 8], [u8; 128]); convert!([u128; 4], [u64; 8]); convert!([u128; 4], [u32; 16]); convert!([u128; 4], [u16; 32]); @@ -79,12 +80,14 @@ pub(crate) trait ReadFromSlice { fn read_u128(&self) -> (u128, &[u8]); fn read_u128x2(&self) -> ([u128; 2], &[u8]); fn read_u128x4(&self) -> ([u128; 4], &[u8]); + fn read_u128x8(&self) -> ([u128; 8], &[u8]); fn read_last_u16(&self) -> u16; fn read_last_u32(&self) -> u32; fn read_last_u64(&self) -> u64; fn read_last_u128(&self) -> u128; fn read_last_u128x2(&self) -> [u128; 2]; fn read_last_u128x4(&self) -> [u128; 4]; + fn read_last_u128x8(&self) -> [u128; 8]; } impl ReadFromSlice for [u8] { @@ -124,6 +127,12 @@ impl ReadFromSlice for [u8] { (as_array!(value, 64).convert(), rest) } + #[inline(always)] + fn read_u128x8(&self) -> ([u128; 8], &[u8]) { + let (value, rest) = self.split_at(128); + (as_array!(value, 128).convert(), rest) + } + #[inline(always)] fn read_last_u16(&self) -> u16 { let (_, value) = self.split_at(self.len() - 2); @@ -159,4 +168,9 @@ impl ReadFromSlice for [u8] { let (_, value) = self.split_at(self.len() - 64); as_array!(value, 64).convert() } + + fn read_last_u128x8(&self) -> [u128; 8] { + let (_, value) = self.split_at(self.len() - 128); + as_array!(value, 128).convert() + } } diff --git a/src/lib.rs b/src/lib.rs index 2086513..5f7551e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -99,6 +99,7 @@ Note the import of [HashMapExt]. This is needed for the constructor. #![cfg_attr(all(not(test), not(feature = "std")), no_std)] #![cfg_attr(feature = "specialize", feature(min_specialization))] #![cfg_attr(feature = "nightly-arm-aes", feature(stdarch_arm_neon_intrinsics))] +#![cfg_attr(feature = "vaes", feature(stdsimd))] #[macro_use] mod convert; diff --git a/src/operations.rs b/src/operations.rs index a420587..969d60b 100644 --- a/src/operations.rs +++ b/src/operations.rs @@ -180,11 +180,109 @@ pub(crate) fn add_in_length(enc: &mut u128, len: u64) { } } + +#[cfg(any( + all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)), + all(target_arch = "aarch64", target_feature = "aes", not(miri)), + all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)), +))] +mod vaes { +use super::*; +#[repr(C, align(32))] +#[derive(zerocopy::AsBytes, zerocopy::FromZeroes, zerocopy::FromBytes, Copy, Clone)] +pub(crate) struct Vector256([u128;2]); + +pub(crate) fn aesdec_vec256(value: Vector256, xor: Vector256) -> Vector256 { + cfg_if::cfg_if!{ + if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "vaes", + feature = "vaes", + not(miri) + ))] { + use core::arch::x86_64::*; + unsafe { + transmute!(_mm256_aesdec_epi128(transmute!(value), transmute!(xor))) + } + } + else { + Vector256( + [ + aesdec(value.0[0], xor.0[0]), + aesdec(value.0[1], xor.0[1]), + ] + ) + } + } +} + +pub(crate) fn add_by_64s_vec256(a: Vector256, b: Vector256) -> Vector256 { + cfg_if::cfg_if!{ + if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "vaes", + feature = "vaes", + not(miri) + ))] { + use core::arch::x86_64::*; + unsafe { + transmute!(_mm256_add_epi64(transmute!(a), transmute!(b))) + } + } + else { + Vector256( + [ + add_by_64s(a.0[0], b.0[0]), + add_by_64s(a.0[1], b.0[1]), + ] + ) + } + } +} + +pub(crate) fn shuffle_vec256(value: Vector256) -> Vector256 { + cfg_if::cfg_if!{ + if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "vaes", + feature = "vaes", + not(miri) + ))] { + unsafe { + use core::arch::x86_64::*; + let mask = transmute!([SHUFFLE_MASK, SHUFFLE_MASK]); + transmute!(_mm256_shuffle_epi8(transmute!(value.0), mask)) + } + } + else { + Vector256( + [ + shuffle(value.0[0]), + shuffle(value.0[1]), + ] + ) + } + } +} + +pub(crate) fn shuffle_and_add_vec256(base: Vector256, to_add: Vector256) -> Vector256 { + add_by_64s_vec256(shuffle_vec256(base), to_add) +} +} + +#[cfg(any( + all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)), + all(target_arch = "aarch64", target_feature = "aes", not(miri)), + all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)), +))] +pub(crate) use vaes::*; + + #[cfg(test)] mod test { use super::*; use crate::convert::Convert; - + // This is code to search for the shuffle constant // //thread_local! { static MASK: Cell = Cell::new(0); } diff --git a/tests/bench.rs b/tests/bench.rs index e038ba4..245455f 100644 --- a/tests/bench.rs +++ b/tests/bench.rs @@ -56,10 +56,11 @@ fn seahash(b: &H) -> u64 { hasher.finish() } -const STRING_LENGTHS: [u32; 12] = [1, 3, 4, 7, 8, 15, 16, 24, 33, 68, 132, 1024]; +const STRING_LENGTHS: &'static [u32; 12] = &[1, 3, 4, 7, 8, 15, 16, 24, 33, 68, 132, 1024]; +const WIDER_STRINGS_LENGTHS: &'static [u32] = &[1, 64, 1024, 4096, 5261, 16384, 19997]; -fn gen_strings() -> Vec { - STRING_LENGTHS +fn gen_strings(lengths: &[u32]) -> Vec { + lengths .iter() .map(|len| { let mut string = String::default(); @@ -83,7 +84,8 @@ macro_rules! bench_inputs { $group.bench_function("u32", |b| b.iter_batched(|| rng.gen::(), |v| $hash(&v), size)); $group.bench_function("u64", |b| b.iter_batched(|| rng.gen::(), |v| $hash(&v), size)); $group.bench_function("u128", |b| b.iter_batched(|| rng.gen::(), |v| $hash(&v), size)); - $group.bench_with_input("strings", &gen_strings(), |b, s| b.iter(|| $hash(black_box(s)))); + $group.bench_with_input("strings", &gen_strings(STRING_LENGTHS), |b, s| b.iter(|| $hash(black_box(s)))); + $group.bench_with_input("wide-strings", &gen_strings(WIDER_STRINGS_LENGTHS), |b, s| b.iter(|| $hash(black_box(s)))); }; } From cd512ae6a158a8182b65fd60c96a2b7dd6596bfa Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 21 Nov 2023 15:03:06 -0500 Subject: [PATCH 2/6] misc: format code --- src/aes_hash.rs | 14 ++--- src/operations.rs | 142 +++++++++++++++++++++++----------------------- tests/bench.rs | 8 ++- 3 files changed, 83 insertions(+), 81 deletions(-) diff --git a/src/aes_hash.rs b/src/aes_hash.rs index 42c27b0..44d1108 100644 --- a/src/aes_hash.rs +++ b/src/aes_hash.rs @@ -40,9 +40,9 @@ fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) { aesenc(hasher.key, tail[6]), aesdec(hasher.key, tail[7]), ]; - let tail : [Vector256;4] = transmute!(tail); - let mut current : [Vector256;4] = transmute!(current); - let mut sum : [Vector256; 2] = [ + let tail: [Vector256; 4] = transmute!(tail); + let mut current: [Vector256; 4] = transmute!(current); + let mut sum: [Vector256; 2] = [ transmute!([hasher.key, !hasher.key]), transmute!([hasher.key, !hasher.key]), ]; @@ -52,7 +52,7 @@ fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) { sum[1] = shuffle_and_add_vec256(sum[1], tail[3]); while data.len() > 128 { let (blocks, rest) = data.read_u128x8(); - let blocks : [Vector256; 4] = transmute!(blocks); + let blocks: [Vector256; 4] = transmute!(blocks); current[0] = aesdec_vec256(current[0], blocks[0]); current[1] = aesdec_vec256(current[1], blocks[1]); current[2] = aesdec_vec256(current[2], blocks[2]); @@ -63,8 +63,8 @@ fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) { sum[1] = shuffle_and_add_vec256(sum[1], blocks[3]); *data = rest; } - let current : [[u128;2]; 4] = transmute!(current); - let sum : [[u128;2]; 2] = transmute!(sum); + let current: [[u128; 2]; 4] = transmute!(current); + let sum: [[u128; 2]; 2] = transmute!(sum); hasher.hash_in_2( aesdec(current[0][0], current[0][1]), aesdec(current[1][0], current[1][1]), @@ -217,7 +217,7 @@ impl Hasher for AHasher { if data.len() > 32 { if data.len() > 128 { return hash_batch_128b(&mut data, self); - } + } if data.len() > 64 { let tail = data.read_last_u128x4(); let mut current: [u128; 4] = [self.key; 4]; diff --git a/src/operations.rs b/src/operations.rs index 969d60b..88a6f15 100644 --- a/src/operations.rs +++ b/src/operations.rs @@ -180,94 +180,93 @@ pub(crate) fn add_in_length(enc: &mut u128, len: u64) { } } - #[cfg(any( all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)), all(target_arch = "aarch64", target_feature = "aes", not(miri)), all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)), ))] mod vaes { -use super::*; -#[repr(C, align(32))] -#[derive(zerocopy::AsBytes, zerocopy::FromZeroes, zerocopy::FromBytes, Copy, Clone)] -pub(crate) struct Vector256([u128;2]); - -pub(crate) fn aesdec_vec256(value: Vector256, xor: Vector256) -> Vector256 { - cfg_if::cfg_if!{ - if #[cfg(all( - any(target_arch = "x86", target_arch = "x86_64"), - target_feature = "vaes", - feature = "vaes", - not(miri) - ))] { - use core::arch::x86_64::*; - unsafe { - transmute!(_mm256_aesdec_epi128(transmute!(value), transmute!(xor))) + use super::*; + #[repr(C, align(32))] + #[derive(zerocopy::AsBytes, zerocopy::FromZeroes, zerocopy::FromBytes, Copy, Clone)] + pub(crate) struct Vector256([u128; 2]); + + pub(crate) fn aesdec_vec256(value: Vector256, xor: Vector256) -> Vector256 { + cfg_if::cfg_if! { + if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "vaes", + feature = "vaes", + not(miri) + ))] { + use core::arch::x86_64::*; + unsafe { + transmute!(_mm256_aesdec_epi128(transmute!(value), transmute!(xor))) + } + } + else { + Vector256( + [ + aesdec(value.0[0], xor.0[0]), + aesdec(value.0[1], xor.0[1]), + ] + ) } - } - else { - Vector256( - [ - aesdec(value.0[0], xor.0[0]), - aesdec(value.0[1], xor.0[1]), - ] - ) } } -} -pub(crate) fn add_by_64s_vec256(a: Vector256, b: Vector256) -> Vector256 { - cfg_if::cfg_if!{ - if #[cfg(all( - any(target_arch = "x86", target_arch = "x86_64"), - target_feature = "vaes", - feature = "vaes", - not(miri) - ))] { - use core::arch::x86_64::*; - unsafe { - transmute!(_mm256_add_epi64(transmute!(a), transmute!(b))) + pub(crate) fn add_by_64s_vec256(a: Vector256, b: Vector256) -> Vector256 { + cfg_if::cfg_if! { + if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "vaes", + feature = "vaes", + not(miri) + ))] { + use core::arch::x86_64::*; + unsafe { + transmute!(_mm256_add_epi64(transmute!(a), transmute!(b))) + } + } + else { + Vector256( + [ + add_by_64s(a.0[0], b.0[0]), + add_by_64s(a.0[1], b.0[1]), + ] + ) } - } - else { - Vector256( - [ - add_by_64s(a.0[0], b.0[0]), - add_by_64s(a.0[1], b.0[1]), - ] - ) } } -} -pub(crate) fn shuffle_vec256(value: Vector256) -> Vector256 { - cfg_if::cfg_if!{ - if #[cfg(all( - any(target_arch = "x86", target_arch = "x86_64"), - target_feature = "vaes", - feature = "vaes", - not(miri) - ))] { - unsafe { - use core::arch::x86_64::*; - let mask = transmute!([SHUFFLE_MASK, SHUFFLE_MASK]); - transmute!(_mm256_shuffle_epi8(transmute!(value.0), mask)) + pub(crate) fn shuffle_vec256(value: Vector256) -> Vector256 { + cfg_if::cfg_if! { + if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "vaes", + feature = "vaes", + not(miri) + ))] { + unsafe { + use core::arch::x86_64::*; + let mask = transmute!([SHUFFLE_MASK, SHUFFLE_MASK]); + transmute!(_mm256_shuffle_epi8(transmute!(value.0), mask)) + } + } + else { + Vector256( + [ + shuffle(value.0[0]), + shuffle(value.0[1]), + ] + ) } - } - else { - Vector256( - [ - shuffle(value.0[0]), - shuffle(value.0[1]), - ] - ) } } -} -pub(crate) fn shuffle_and_add_vec256(base: Vector256, to_add: Vector256) -> Vector256 { - add_by_64s_vec256(shuffle_vec256(base), to_add) -} + pub(crate) fn shuffle_and_add_vec256(base: Vector256, to_add: Vector256) -> Vector256 { + add_by_64s_vec256(shuffle_vec256(base), to_add) + } } #[cfg(any( @@ -277,12 +276,11 @@ pub(crate) fn shuffle_and_add_vec256(base: Vector256, to_add: Vector256) -> Vect ))] pub(crate) use vaes::*; - #[cfg(test)] mod test { use super::*; use crate::convert::Convert; - + // This is code to search for the shuffle constant // //thread_local! { static MASK: Cell = Cell::new(0); } diff --git a/tests/bench.rs b/tests/bench.rs index 245455f..24a08a6 100644 --- a/tests/bench.rs +++ b/tests/bench.rs @@ -84,8 +84,12 @@ macro_rules! bench_inputs { $group.bench_function("u32", |b| b.iter_batched(|| rng.gen::(), |v| $hash(&v), size)); $group.bench_function("u64", |b| b.iter_batched(|| rng.gen::(), |v| $hash(&v), size)); $group.bench_function("u128", |b| b.iter_batched(|| rng.gen::(), |v| $hash(&v), size)); - $group.bench_with_input("strings", &gen_strings(STRING_LENGTHS), |b, s| b.iter(|| $hash(black_box(s)))); - $group.bench_with_input("wide-strings", &gen_strings(WIDER_STRINGS_LENGTHS), |b, s| b.iter(|| $hash(black_box(s)))); + $group.bench_with_input("strings", &gen_strings(STRING_LENGTHS), |b, s| { + b.iter(|| $hash(black_box(s))) + }); + $group.bench_with_input("wide-strings", &gen_strings(WIDER_STRINGS_LENGTHS), |b, s| { + b.iter(|| $hash(black_box(s))) + }); }; } From cc352e5a4e3187fa099c9f438c56cad224eaab80 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 21 Nov 2023 17:15:59 -0500 Subject: [PATCH 3/6] adjust performance and testings --- smhasher/ahash-cbindings/Cargo.toml | 5 +- smhasher/ahash-cbindings/install.sh | 10 +- smhasher/ahash-cbindings/src/lib.rs | 1 - src/aes_hash.rs | 54 ++++---- src/operations.rs | 186 ++++++++++++++++++++++++---- 5 files changed, 201 insertions(+), 55 deletions(-) diff --git a/smhasher/ahash-cbindings/Cargo.toml b/smhasher/ahash-cbindings/Cargo.toml index 49513b3..5ac69e0 100644 --- a/smhasher/ahash-cbindings/Cargo.toml +++ b/smhasher/ahash-cbindings/Cargo.toml @@ -17,4 +17,7 @@ lto = 'fat' debug-assertions = false [dependencies] -ahash = { path = "../../", default-features = false } \ No newline at end of file +ahash = { path = "../../", default-features = false } + +[features] +vaes = ["ahash/vaes"] diff --git a/smhasher/ahash-cbindings/install.sh b/smhasher/ahash-cbindings/install.sh index 2effe93..7f9142f 100755 --- a/smhasher/ahash-cbindings/install.sh +++ b/smhasher/ahash-cbindings/install.sh @@ -1 +1,9 @@ -RUSTFLAGS="-C opt-level=3 -C target-cpu=native -C codegen-units=1" cargo build --release && sudo cp target/release/libahash_c.a /usr/local/lib/ + +# check if args contains vaes +if [[ $* == *vaes* ]]; then + export CARGO_OPTS="--features=vaes" +else + export CARGO_OPTS="" +fi + +RUSTFLAGS="-C opt-level=3 -C target-cpu=native -C codegen-units=1" cargo build ${CARGO_OPTS} --release && sudo cp target/release/libahash_c.a /usr/local/lib/ diff --git a/smhasher/ahash-cbindings/src/lib.rs b/smhasher/ahash-cbindings/src/lib.rs index e828f12..9c8fb05 100644 --- a/smhasher/ahash-cbindings/src/lib.rs +++ b/smhasher/ahash-cbindings/src/lib.rs @@ -1,6 +1,5 @@ use ahash::*; use core::slice; -use std::hash::{BuildHasher}; #[no_mangle] pub extern "C" fn ahash64(buf: *const (), len: usize, seed: u64) -> u64 { diff --git a/src/aes_hash.rs b/src/aes_hash.rs index 44d1108..62e82b5 100644 --- a/src/aes_hash.rs +++ b/src/aes_hash.rs @@ -27,52 +27,47 @@ pub struct AHasher { all(target_arch = "aarch64", target_feature = "aes", not(miri)), all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)), ))] +#[inline(never)] fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) { - use zerocopy::transmute; - let tail = data.read_last_u128x8(); - let current = [ - aesenc(hasher.key, tail[0]), - aesdec(hasher.key, tail[1]), - aesenc(hasher.key, tail[2]), - aesdec(hasher.key, tail[3]), - aesenc(hasher.key, tail[4]), - aesdec(hasher.key, tail[5]), - aesenc(hasher.key, tail[6]), - aesdec(hasher.key, tail[7]), - ]; - let tail: [Vector256; 4] = transmute!(tail); - let mut current: [Vector256; 4] = transmute!(current); - let mut sum: [Vector256; 2] = [ - transmute!([hasher.key, !hasher.key]), - transmute!([hasher.key, !hasher.key]), + let tail = read_last4_vec256(data); + let mut current = [ + aesenc_vec256(convert_u128_to_vec256(hasher.key), tail[0]), + aesdec_vec256(convert_u128_to_vec256(hasher.key), tail[1]), + aesenc_vec256(convert_u128_to_vec256(hasher.key), tail[2]), + aesdec_vec256(convert_u128_to_vec256(hasher.key), tail[3]), ]; + let mut sum: [Vector256; 2] = [convert_u128_to_vec256(hasher.key), convert_u128_to_vec256(!hasher.key)]; sum[0] = add_by_64s_vec256(sum[0], tail[0]); - sum[0] = add_by_64s_vec256(sum[0], tail[1]); - sum[1] = shuffle_and_add_vec256(sum[1], tail[2]); + sum[1] = add_by_64s_vec256(sum[1], tail[1]); + sum[0] = shuffle_and_add_vec256(sum[0], tail[2]); sum[1] = shuffle_and_add_vec256(sum[1], tail[3]); while data.len() > 128 { - let (blocks, rest) = data.read_u128x8(); - let blocks: [Vector256; 4] = transmute!(blocks); + let (blocks, rest) = read4_vec256(data); current[0] = aesdec_vec256(current[0], blocks[0]); current[1] = aesdec_vec256(current[1], blocks[1]); current[2] = aesdec_vec256(current[2], blocks[2]); current[3] = aesdec_vec256(current[3], blocks[3]); sum[0] = shuffle_and_add_vec256(sum[0], blocks[0]); - sum[0] = shuffle_and_add_vec256(sum[0], blocks[1]); - sum[1] = shuffle_and_add_vec256(sum[1], blocks[2]); + sum[1] = shuffle_and_add_vec256(sum[1], blocks[1]); + sum[0] = shuffle_and_add_vec256(sum[0], blocks[2]); sum[1] = shuffle_and_add_vec256(sum[1], blocks[3]); *data = rest; } - let current: [[u128; 2]; 4] = transmute!(current); - let sum: [[u128; 2]; 2] = transmute!(sum); + let current = [ + convert_vec256_to_u128(current[0]), + convert_vec256_to_u128(current[1]), + convert_vec256_to_u128(current[2]), + convert_vec256_to_u128(current[3]), + ]; + let sum = [convert_vec256_to_u128(sum[0]), convert_vec256_to_u128(sum[1])]; hasher.hash_in_2( - aesdec(current[0][0], current[0][1]), - aesdec(current[1][0], current[1][1]), + aesenc(current[0][0], current[0][1]), + aesenc(current[1][0], current[1][1]), ); hasher.hash_in(add_by_64s(sum[0][0].convert(), sum[0][1].convert()).convert()); hasher.hash_in_2( - aesdec(current[2][0], current[2][1]), - aesdec(current[3][0], current[3][1]), + aesenc(current[2][0], current[2][1]), + aesenc(current[3][0], current[3][1]), ); hasher.hash_in(add_by_64s(sum[1][0].convert(), sum[1][1].convert()).convert()); } @@ -102,6 +97,7 @@ impl AHasher { /// /// println!("Hash is {:x}!", hasher.finish()); /// ``` + #[allow(unused)] #[inline] pub(crate) fn new_with_keys(key1: u128, key2: u128) -> Self { let pi: [u128; 2] = PI.convert(); diff --git a/src/operations.rs b/src/operations.rs index 88a6f15..c4fc93e 100644 --- a/src/operations.rs +++ b/src/operations.rs @@ -187,10 +187,44 @@ pub(crate) fn add_in_length(enc: &mut u128, len: u64) { ))] mod vaes { use super::*; - #[repr(C, align(32))] - #[derive(zerocopy::AsBytes, zerocopy::FromZeroes, zerocopy::FromBytes, Copy, Clone)] - pub(crate) struct Vector256([u128; 2]); + cfg_if::cfg_if! { + if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "vaes", + feature = "vaes", + not(miri) + ))] { + pub type Vector256 = core::arch::x86_64::__m256i; + } + else { + pub type Vector256 = [u128;2]; + } + } + #[inline(always)] + pub(crate) fn aesenc_vec256(value: Vector256, xor: Vector256) -> Vector256 { + cfg_if::cfg_if! { + if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "vaes", + feature = "vaes", + not(miri) + ))] { + use core::arch::x86_64::*; + unsafe { + _mm256_aesenc_epi128(value, xor) + } + } + else { + [ + aesenc(value[0], xor[0]), + aesenc(value[1], xor[1]), + ] + } + } + } + + #[inline(always)] pub(crate) fn aesdec_vec256(value: Vector256, xor: Vector256) -> Vector256 { cfg_if::cfg_if! { if #[cfg(all( @@ -201,20 +235,19 @@ mod vaes { ))] { use core::arch::x86_64::*; unsafe { - transmute!(_mm256_aesdec_epi128(transmute!(value), transmute!(xor))) + _mm256_aesdec_epi128(value, xor) } } else { - Vector256( [ - aesdec(value.0[0], xor.0[0]), - aesdec(value.0[1], xor.0[1]), + aesdec(value[0], xor[0]), + aesdec(value[1], xor[1]), ] - ) } } } + #[inline(always)] pub(crate) fn add_by_64s_vec256(a: Vector256, b: Vector256) -> Vector256 { cfg_if::cfg_if! { if #[cfg(all( @@ -224,21 +257,18 @@ mod vaes { not(miri) ))] { use core::arch::x86_64::*; - unsafe { - transmute!(_mm256_add_epi64(transmute!(a), transmute!(b))) - } + unsafe { _mm256_add_epi64(a, b) } } else { - Vector256( - [ - add_by_64s(a.0[0], b.0[0]), - add_by_64s(a.0[1], b.0[1]), - ] - ) + [ + transmute!(add_by_64s(transmute!(a[0]), transmute!(b[0]))), + transmute!(add_by_64s(transmute!(a[1]), transmute!(b[1]))), + ] } } } + #[inline(always)] pub(crate) fn shuffle_vec256(value: Vector256) -> Vector256 { cfg_if::cfg_if! { if #[cfg(all( @@ -249,17 +279,21 @@ mod vaes { ))] { unsafe { use core::arch::x86_64::*; - let mask = transmute!([SHUFFLE_MASK, SHUFFLE_MASK]); - transmute!(_mm256_shuffle_epi8(transmute!(value.0), mask)) + let mask : __m256i = _mm256_set_epi64x( + SHUFFLE_MASK as i64, + (SHUFFLE_MASK >> 64) as i64, + SHUFFLE_MASK as i64, + (SHUFFLE_MASK >> 64) as i64, + ); + _mm256_shuffle_epi8(value, mask) } } else { - Vector256( + [ - shuffle(value.0[0]), - shuffle(value.0[1]), + shuffle(value[0]), + shuffle(value[1]), ] - ) } } } @@ -267,6 +301,112 @@ mod vaes { pub(crate) fn shuffle_and_add_vec256(base: Vector256, to_add: Vector256) -> Vector256 { add_by_64s_vec256(shuffle_vec256(base), to_add) } + + // We specialize this routine because sometimes the compiler is not able to + // optimize it properly. + pub(crate) fn read4_vec256(data: &[u8]) -> ([Vector256; 4], &[u8]) { + cfg_if::cfg_if! { + if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "vaes", + feature = "vaes", + not(miri) + ))] { + use core::arch::x86_64::*; + let (arr, rem) = data.split_at(128); + let arr = unsafe { + [ _mm256_loadu_si256(arr.as_ptr().cast::<__m256i>()), + _mm256_loadu_si256(arr.as_ptr().add(32).cast::<__m256i>()), + _mm256_loadu_si256(arr.as_ptr().add(64).cast::<__m256i>()), + _mm256_loadu_si256(arr.as_ptr().add(96).cast::<__m256i>()), + ] + }; + (arr, rem) + } + else { + let (arr, slice) = data.read_u128x8(); + (transmute!(arr), slice) + } + } + } + + // We specialize this routine because sometimes the compiler is not able to + // optimize it properly. + pub(crate) fn read_last4_vec256(data: &[u8]) -> [Vector256; 4] { + cfg_if::cfg_if! { + if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "vaes", + feature = "vaes", + not(miri) + ))] { + use core::arch::x86_64::*; + let (_, arr) = data.split_at(data.len() - 128); + let arr = unsafe { + [ _mm256_loadu_si256(arr.as_ptr().cast::<__m256i>()), + _mm256_loadu_si256(arr.as_ptr().add(32).cast::<__m256i>()), + _mm256_loadu_si256(arr.as_ptr().add(64).cast::<__m256i>()), + _mm256_loadu_si256(arr.as_ptr().add(96).cast::<__m256i>()), + ] + }; + arr + } + else { + let arr = data.read_last_u128x8(); + transmute!(arr) + } + } + } + + // We specialize this routine because sometimes the compiler is not able to + // optimize it properly. + pub(crate) fn convert_u128_to_vec256(x: u128) -> Vector256 { + cfg_if::cfg_if! { + if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "vaes", + feature = "vaes", + not(miri) + ))] { + use core::arch::x86_64::*; + unsafe { + _mm256_set_epi64x( + x as i64, + (x >> 64) as i64, + x as i64, + (x >> 64) as i64, + ) + } + } + else { + transmute!([x, x]) + } + } + } + + // We specialize this routine because sometimes the compiler is not able to + // optimize it properly. + pub(crate) fn convert_vec256_to_u128(x: Vector256) -> [u128; 2] { + cfg_if::cfg_if! { + if #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "vaes", + feature = "vaes", + not(miri) + ))] { + use core::arch::x86_64::*; + unsafe { + [ + transmute!(_mm256_extracti128_si256(x, 0)), + transmute!(_mm256_extracti128_si256(x, 1)), + ] + } + } + else { + x + } + } + } } #[cfg(any( From f86bd295055cd5fcc8d0766544e86023cd9406cb Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 21 Nov 2023 17:17:48 -0500 Subject: [PATCH 4/6] remove never inline --- src/aes_hash.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/aes_hash.rs b/src/aes_hash.rs index 62e82b5..ed7052d 100644 --- a/src/aes_hash.rs +++ b/src/aes_hash.rs @@ -27,7 +27,6 @@ pub struct AHasher { all(target_arch = "aarch64", target_feature = "aes", not(miri)), all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)), ))] -#[inline(never)] fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) { let tail = read_last4_vec256(data); let mut current = [ From 93a8ebb79d531d8e00e28af12d450550fff81c36 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 21 Nov 2023 17:19:02 -0500 Subject: [PATCH 5/6] remove extra flags --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index e2989fc..258e1a9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -87,7 +87,7 @@ serde = { version = "1.0.117", optional = true } cfg-if = "1.0" atomic-polyfill = { version="1.0.1", optional=true} getrandom = { version = "0.2.7", optional = true } -zerocopy = { version = "0.7.20", default-features = false, features = ["simd", "derive"] } +zerocopy = { version = "0.7.20", default-features = false, features = ["simd"] } [target.'cfg(not(all(target_arch = "arm", target_os = "none")))'.dependencies] once_cell = { version = "1.18.0", default-features = false, features = ["alloc"] } From 6a4f02423e76bc376c8a1610a2731882600be346 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 21 Nov 2023 17:36:42 -0500 Subject: [PATCH 6/6] correct shuffle masks --- src/operations.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/operations.rs b/src/operations.rs index c4fc93e..911202c 100644 --- a/src/operations.rs +++ b/src/operations.rs @@ -279,12 +279,7 @@ mod vaes { ))] { unsafe { use core::arch::x86_64::*; - let mask : __m256i = _mm256_set_epi64x( - SHUFFLE_MASK as i64, - (SHUFFLE_MASK >> 64) as i64, - SHUFFLE_MASK as i64, - (SHUFFLE_MASK >> 64) as i64, - ); + let mask = convert_u128_to_vec256(SHUFFLE_MASK); _mm256_shuffle_epi8(value, mask) } } @@ -371,10 +366,10 @@ mod vaes { use core::arch::x86_64::*; unsafe { _mm256_set_epi64x( - x as i64, (x >> 64) as i64, x as i64, (x >> 64) as i64, + x as i64, ) } }