From 716c876aad264cf880ed7b7a002d37d5383fc4c9 Mon Sep 17 00:00:00 2001 From: Zhang Li Date: Thu, 24 Oct 2024 10:38:40 +0800 Subject: [PATCH] replace gxhash with foldhash (#624) Co-authored-by: zhangli20 --- Cargo.lock | 17 +++++++---------- native-engine/datafusion-ext-plans/Cargo.toml | 2 +- .../datafusion-ext-plans/src/agg/acc.rs | 5 ++++- .../src/agg/agg_hash_map.rs | 9 +++++++-- .../datafusion-ext-plans/src/agg/agg_table.rs | 6 +++++- .../src/joins/join_hash_map.rs | 8 +++++++- 6 files changed, 31 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e8a728b3..ec4eb6f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -978,9 +978,9 @@ dependencies = [ "datafusion-ext-exprs", "datafusion-ext-functions", "derivative", + "foldhash", "futures", "futures-util", - "gxhash", "hashbrown 0.14.5", "itertools 0.13.0", "jni", @@ -1314,6 +1314,12 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "foldhash" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -1451,15 +1457,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" -[[package]] -name = "gxhash" -version = "3.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a197c9b654827513cf53842c5c6d3da2b4b35a785f8e0eff78bdf8e445aba1bb" -dependencies = [ - "rustversion", -] - [[package]] name = "half" version = "2.4.1" diff --git a/native-engine/datafusion-ext-plans/Cargo.toml b/native-engine/datafusion-ext-plans/Cargo.toml index 3ef15253..f35f6c87 100644 --- a/native-engine/datafusion-ext-plans/Cargo.toml +++ b/native-engine/datafusion-ext-plans/Cargo.toml @@ -22,8 +22,8 @@ datafusion-ext-commons = { workspace = true } datafusion-ext-exprs = { workspace = true } datafusion-ext-functions = { 
workspace = true } derivative = "2.2.0" +foldhash = "0.1.3" futures = "0.3" -gxhash = "3.4.1" hashbrown = "0.14.5" itertools = "0.13.0" jni = "0.20.0" diff --git a/native-engine/datafusion-ext-plans/src/agg/acc.rs b/native-engine/datafusion-ext-plans/src/agg/acc.rs index bea1e9d9..5c6ecee9 100644 --- a/native-engine/datafusion-ext-plans/src/agg/acc.rs +++ b/native-engine/datafusion-ext-plans/src/agg/acc.rs @@ -14,6 +14,7 @@ use std::{ any::Any, + hash::BuildHasher, io::{Cursor, Read, Write}, mem::{size_of, size_of_val}, }; @@ -932,7 +933,9 @@ impl InternalSet { #[inline] pub fn acc_hash(value: impl AsRef<[u8]>) -> u64 { const ACC_HASH_SEED: u32 = 0x7BCB48DA; - gxhash::gxhash64(value.as_ref(), ACC_HASH_SEED as i64) + const HASHER: foldhash::fast::FixedState = + foldhash::fast::FixedState::with_seed(ACC_HASH_SEED as u64); + HASHER.hash_one(value.as_ref()) } impl AggDynSet { diff --git a/native-engine/datafusion-ext-plans/src/agg/agg_hash_map.rs b/native-engine/datafusion-ext-plans/src/agg/agg_hash_map.rs index 0816b882..e95f603a 100644 --- a/native-engine/datafusion-ext-plans/src/agg/agg_hash_map.rs +++ b/native-engine/datafusion-ext-plans/src/agg/agg_hash_map.rs @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::simd::{cmp::SimdPartialEq, Simd}; +use std::{ + hash::BuildHasher, + simd::{cmp::SimdPartialEq, Simd}, +}; use datafusion::common::Result; use datafusion_ext_commons::{ @@ -236,5 +239,7 @@ impl AggHashMap { pub fn agg_hash(value: impl AsRef<[u8]>) -> u32 { // 32-bits non-zero hash const AGG_HASH_SEED_HASHING: i64 = 0x3F6F1B93; - gxhash::gxhash32(value.as_ref(), AGG_HASH_SEED_HASHING) | 0x80000000 + const HASHER: foldhash::fast::FixedState = + foldhash::fast::FixedState::with_seed(AGG_HASH_SEED_HASHING as u64); + HASHER.hash_one(value.as_ref()) as u32 | 0x80000000 } diff --git a/native-engine/datafusion-ext-plans/src/agg/agg_table.rs b/native-engine/datafusion-ext-plans/src/agg/agg_table.rs index f226d422..1e8e38e6 100644 --- a/native-engine/datafusion-ext-plans/src/agg/agg_table.rs +++ b/native-engine/datafusion-ext-plans/src/agg/agg_table.rs @@ -13,6 +13,7 @@ // limitations under the License. use std::{ + hash::BuildHasher, io::{Cursor, Read, Write}, sync::{Arc, Weak}, }; @@ -819,5 +820,8 @@ impl<'a> KeyForRadixTournamentTree for RecordsSpillCursor<'a> { #[inline] fn bucket_id(key: impl AsRef<[u8]>) -> u16 { const AGG_HASH_SEED_HASHING: i64 = 0xC732BD66; - (gxhash::gxhash32(key.as_ref(), AGG_HASH_SEED_HASHING) % NUM_SPILL_BUCKETS as u32) as u16 + const HASHER: foldhash::fast::FixedState = + foldhash::fast::FixedState::with_seed(AGG_HASH_SEED_HASHING as u64); + let hash = HASHER.hash_one(key.as_ref()) as u32; + (hash % NUM_SPILL_BUCKETS as u32) as u16 } diff --git a/native-engine/datafusion-ext-plans/src/joins/join_hash_map.rs b/native-engine/datafusion-ext-plans/src/joins/join_hash_map.rs index fa517cf1..cf04b260 100644 --- a/native-engine/datafusion-ext-plans/src/joins/join_hash_map.rs +++ b/native-engine/datafusion-ext-plans/src/joins/join_hash_map.rs @@ -14,6 +14,7 @@ use std::{ fmt::{Debug, Formatter}, + hash::{BuildHasher, Hasher}, io::Cursor, mem::MaybeUninit, simd::{cmp::SimdPartialEq, Simd}, @@ -434,8 +435,13 @@ pub fn 
join_hash_map_schema(data_schema: &SchemaRef) -> SchemaRef { #[inline] pub fn join_create_hashes(num_rows: usize, key_columns: &[ArrayRef]) -> Vec<u32> { const JOIN_HASH_RANDOM_SEED: u32 = 0x1E39FA04; + const HASHER: foldhash::fast::FixedState = + foldhash::fast::FixedState::with_seed(JOIN_HASH_RANDOM_SEED as u64); let mut hashes = create_hashes(num_rows, key_columns, JOIN_HASH_RANDOM_SEED, |v, h| { - gxhash::gxhash32(v, h as i64) + let mut hasher = HASHER.build_hasher(); + hasher.write_u32(h); + hasher.write(v); + hasher.finish() as u32 }); // use 31-bit non-zero hash