/Users/andrewlamb/Software/datafusion/datafusion/common/src/hash_utils.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Functionality used both on logical and physical plans |
19 | | |
20 | | #[cfg(not(feature = "force_hash_collisions"))] |
21 | | use std::sync::Arc; |
22 | | |
23 | | use ahash::RandomState; |
24 | | use arrow::array::*; |
25 | | use arrow::datatypes::*; |
26 | | #[cfg(not(feature = "force_hash_collisions"))] |
27 | | use arrow::{downcast_dictionary_array, downcast_primitive_array}; |
28 | | use arrow_buffer::IntervalDayTime; |
29 | | use arrow_buffer::IntervalMonthDayNano; |
30 | | |
31 | | #[cfg(not(feature = "force_hash_collisions"))] |
32 | | use crate::cast::{ |
33 | | as_binary_view_array, as_boolean_array, as_fixed_size_list_array, |
34 | | as_generic_binary_array, as_large_list_array, as_list_array, as_map_array, |
35 | | as_primitive_array, as_string_array, as_string_view_array, as_struct_array, |
36 | | }; |
37 | | use crate::error::Result; |
38 | | #[cfg(not(feature = "force_hash_collisions"))] |
39 | | use crate::error::_internal_err; |
40 | | |
/// Combines two hash values into one.
///
/// Mixes the running hash `l` with the new value `r` using the classic
/// multiply-and-add scheme (seeded with 17 * 37 == 629), with wrapping
/// arithmetic so overflow is well-defined.
#[inline]
pub fn combine_hashes(l: u64, r: u64) -> u64 {
    // 629 == 17 * 37, identical seed to the original formulation
    let acc = 629u64.wrapping_add(l);
    acc.wrapping_mul(37).wrapping_add(r)
}
47 | | |
48 | | #[cfg(not(feature = "force_hash_collisions"))] |
49 | 0 | fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) { |
50 | 0 | if mul_col { |
51 | 0 | hashes_buffer.iter_mut().for_each(|hash| { |
52 | 0 | // stable hash for null value |
53 | 0 | *hash = combine_hashes(random_state.hash_one(1), *hash); |
54 | 0 | }) |
55 | | } else { |
56 | 0 | hashes_buffer.iter_mut().for_each(|hash| { |
57 | 0 | *hash = random_state.hash_one(1); |
58 | 0 | }) |
59 | | } |
60 | 0 | } |
61 | | |
/// A value that can be hashed to a single `u64` using a [`RandomState`].
pub trait HashValue {
    /// Returns the hash of `self` under `state`.
    fn hash_one(&self, state: &RandomState) -> u64;
}
65 | | |
// Blanket impl: a reference hashes exactly like the value it points to.
impl<'a, T: HashValue + ?Sized> HashValue for &'a T {
    fn hash_one(&self, state: &RandomState) -> u64 {
        T::hash_one(self, state)
    }
}
71 | | |
/// Implements [`HashValue`] for integer and byte-like types by hashing
/// the value directly with the provided [`RandomState`].
macro_rules! hash_value {
    ($($t:ty),+) => {
        $(impl HashValue for $t {
            fn hash_one(&self, state: &RandomState) -> u64 {
                state.hash_one(self)
            }
        })+
    };
}
hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
83 | | |
/// Implements [`HashValue`] for floating point types by hashing the raw
/// bit pattern (`to_ne_bytes` reinterpreted as the matching unsigned
/// integer). Hashing the bits sidesteps `f32`/`f64` not implementing
/// `Hash`; note this means `0.0` and `-0.0` hash differently.
macro_rules! hash_float_value {
    ($(($t:ty, $i:ty)),+) => {
        $(impl HashValue for $t {
            fn hash_one(&self, state: &RandomState) -> u64 {
                state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes()))
            }
        })+
    };
}
hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
94 | | |
95 | | /// Builds hash values of PrimitiveArray and writes them into `hashes_buffer` |
96 | | /// If `rehash==true` this combines the previous hash value in the buffer |
97 | | /// with the new hash using `combine_hashes` |
98 | | #[cfg(not(feature = "force_hash_collisions"))] |
99 | 29.5k | fn hash_array_primitive<T>( |
100 | 29.5k | array: &PrimitiveArray<T>, |
101 | 29.5k | random_state: &RandomState, |
102 | 29.5k | hashes_buffer: &mut [u64], |
103 | 29.5k | rehash: bool, |
104 | 29.5k | ) where |
105 | 29.5k | T: ArrowPrimitiveType, |
106 | 29.5k | <T as arrow_array::ArrowPrimitiveType>::Native: HashValue, |
107 | 29.5k | { |
108 | 29.5k | assert_eq!( |
109 | 29.5k | hashes_buffer.len(), |
110 | 29.5k | array.len(), |
111 | 0 | "hashes_buffer and array should be of equal length" |
112 | | ); |
113 | | |
114 | 29.5k | if array.null_count() == 0 { |
115 | 29.4k | if rehash { |
116 | 65.7k | for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter())69 { |
117 | 65.7k | *hash = combine_hashes(value.hash_one(random_state), *hash); |
118 | 65.7k | } |
119 | | } else { |
120 | 133k | for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter())29.4k { |
121 | 133k | *hash = value.hash_one(random_state); |
122 | 133k | } |
123 | | } |
124 | 72 | } else if rehash { |
125 | 131k | for (i, hash) in hashes_buffer.iter_mut().enumerate()36 { |
126 | 131k | if !array.is_null(i) { |
127 | 20 | let value = unsafe { array.value_unchecked(i) }; |
128 | 20 | *hash = combine_hashes(value.hash_one(random_state), *hash); |
129 | 131k | } |
130 | | } |
131 | | } else { |
132 | 65.6k | for (i, hash) in hashes_buffer.iter_mut().enumerate()36 { |
133 | 65.6k | if !array.is_null(i) { |
134 | 30 | let value = unsafe { array.value_unchecked(i) }; |
135 | 30 | *hash = value.hash_one(random_state); |
136 | 65.6k | } |
137 | | } |
138 | | } |
139 | 29.5k | } |
140 | | |
141 | | /// Hashes one array into the `hashes_buffer` |
142 | | /// If `rehash==true` this combines the previous hash value in the buffer |
143 | | /// with the new hash using `combine_hashes` |
144 | | #[cfg(not(feature = "force_hash_collisions"))] |
145 | 11 | fn hash_array<T>( |
146 | 11 | array: T, |
147 | 11 | random_state: &RandomState, |
148 | 11 | hashes_buffer: &mut [u64], |
149 | 11 | rehash: bool, |
150 | 11 | ) where |
151 | 11 | T: ArrayAccessor, |
152 | 11 | T::Item: HashValue, |
153 | 11 | { |
154 | 11 | assert_eq!( |
155 | 11 | hashes_buffer.len(), |
156 | 11 | array.len(), |
157 | 0 | "hashes_buffer and array should be of equal length" |
158 | | ); |
159 | | |
160 | 11 | if array.null_count() == 0 { |
161 | 11 | if rehash { |
162 | 0 | for (i, hash) in hashes_buffer.iter_mut().enumerate() { |
163 | 0 | let value = unsafe { array.value_unchecked(i) }; |
164 | 0 | *hash = combine_hashes(value.hash_one(random_state), *hash); |
165 | 0 | } |
166 | | } else { |
167 | 20 | for (i, hash) in hashes_buffer.iter_mut().enumerate()11 { |
168 | 20 | let value = unsafe { array.value_unchecked(i) }; |
169 | 20 | *hash = value.hash_one(random_state); |
170 | 20 | } |
171 | | } |
172 | 0 | } else if rehash { |
173 | 0 | for (i, hash) in hashes_buffer.iter_mut().enumerate() { |
174 | 0 | if !array.is_null(i) { |
175 | 0 | let value = unsafe { array.value_unchecked(i) }; |
176 | 0 | *hash = combine_hashes(value.hash_one(random_state), *hash); |
177 | 0 | } |
178 | | } |
179 | | } else { |
180 | 0 | for (i, hash) in hashes_buffer.iter_mut().enumerate() { |
181 | 0 | if !array.is_null(i) { |
182 | 0 | let value = unsafe { array.value_unchecked(i) }; |
183 | 0 | *hash = value.hash_one(random_state); |
184 | 0 | } |
185 | | } |
186 | | } |
187 | 11 | } |
188 | | |
189 | | /// Hash the values in a dictionary array |
190 | | #[cfg(not(feature = "force_hash_collisions"))] |
191 | 2 | fn hash_dictionary<K: ArrowDictionaryKeyType>( |
192 | 2 | array: &DictionaryArray<K>, |
193 | 2 | random_state: &RandomState, |
194 | 2 | hashes_buffer: &mut [u64], |
195 | 2 | multi_col: bool, |
196 | 2 | ) -> Result<()> { |
197 | 2 | // Hash each dictionary value once, and then use that computed |
198 | 2 | // hash for each key value to avoid a potentially expensive |
199 | 2 | // redundant hashing for large dictionary elements (e.g. strings) |
200 | 2 | let values = Arc::clone(array.values()); |
201 | 2 | let mut dict_hashes = vec![0; values.len()]; |
202 | 2 | create_hashes(&[values], random_state, &mut dict_hashes)?0 ; |
203 | | |
204 | | // combine hash for each index in values |
205 | 2 | if multi_col { |
206 | 3 | for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter())1 { |
207 | 3 | if let Some(key) = key { |
208 | 3 | *hash = combine_hashes(dict_hashes[key.as_usize()], *hash) |
209 | 0 | } // no update for Null, consistent with other hashes |
210 | | } |
211 | | } else { |
212 | 3 | for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter())1 { |
213 | 3 | if let Some(key2 ) = key { |
214 | 2 | *hash = dict_hashes[key.as_usize()] |
215 | 1 | } // no update for Null, consistent with other hashes |
216 | | } |
217 | | } |
218 | 2 | Ok(()) |
219 | 2 | } |
220 | | |
221 | | #[cfg(not(feature = "force_hash_collisions"))] |
222 | 7 | fn hash_struct_array( |
223 | 7 | array: &StructArray, |
224 | 7 | random_state: &RandomState, |
225 | 7 | hashes_buffer: &mut [u64], |
226 | 7 | ) -> Result<()> { |
227 | 7 | let nulls = array.nulls(); |
228 | 7 | let row_len = array.len(); |
229 | | |
230 | 7 | let valid_row_indices: Vec<usize> = if let Some(nulls4 ) = nulls { |
231 | 4 | nulls.valid_indices().collect() |
232 | | } else { |
233 | 3 | (0..row_len).collect() |
234 | | }; |
235 | | |
236 | | // Create hashes for each row that combines the hashes over all the column at that row. |
237 | 7 | let mut values_hashes = vec![0u64; row_len]; |
238 | 7 | create_hashes(array.columns(), random_state, &mut values_hashes)?0 ; |
239 | | |
240 | 18 | for i11 in valid_row_indices { |
241 | 11 | let hash = &mut hashes_buffer[i]; |
242 | 11 | *hash = combine_hashes(*hash, values_hashes[i]); |
243 | 11 | } |
244 | | |
245 | 7 | Ok(()) |
246 | 7 | } |
247 | | |
248 | | // only adding this `cfg` b/c this function is only used with this `cfg` |
249 | | #[cfg(not(feature = "force_hash_collisions"))] |
250 | 0 | fn hash_map_array( |
251 | 0 | array: &MapArray, |
252 | 0 | random_state: &RandomState, |
253 | 0 | hashes_buffer: &mut [u64], |
254 | 0 | ) -> Result<()> { |
255 | 0 | let nulls = array.nulls(); |
256 | 0 | let offsets = array.offsets(); |
257 | 0 |
|
258 | 0 | // Create hashes for each entry in each row |
259 | 0 | let mut values_hashes = vec![0u64; array.entries().len()]; |
260 | 0 | create_hashes(array.entries().columns(), random_state, &mut values_hashes)?; |
261 | | |
262 | | // Combine the hashes for entries on each row with each other and previous hash for that row |
263 | 0 | if let Some(nulls) = nulls { |
264 | 0 | for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { |
265 | 0 | if nulls.is_valid(i) { |
266 | 0 | let hash = &mut hashes_buffer[i]; |
267 | 0 | for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] { |
268 | 0 | *hash = combine_hashes(*hash, *values_hash); |
269 | 0 | } |
270 | 0 | } |
271 | | } |
272 | | } else { |
273 | 0 | for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { |
274 | 0 | let hash = &mut hashes_buffer[i]; |
275 | 0 | for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] { |
276 | 0 | *hash = combine_hashes(*hash, *values_hash); |
277 | 0 | } |
278 | | } |
279 | | } |
280 | | |
281 | 0 | Ok(()) |
282 | 0 | } |
283 | | |
284 | | #[cfg(not(feature = "force_hash_collisions"))] |
285 | 0 | fn hash_list_array<OffsetSize>( |
286 | 0 | array: &GenericListArray<OffsetSize>, |
287 | 0 | random_state: &RandomState, |
288 | 0 | hashes_buffer: &mut [u64], |
289 | 0 | ) -> Result<()> |
290 | 0 | where |
291 | 0 | OffsetSize: OffsetSizeTrait, |
292 | 0 | { |
293 | 0 | let values = Arc::clone(array.values()); |
294 | 0 | let offsets = array.value_offsets(); |
295 | 0 | let nulls = array.nulls(); |
296 | 0 | let mut values_hashes = vec![0u64; values.len()]; |
297 | 0 | create_hashes(&[values], random_state, &mut values_hashes)?; |
298 | 0 | if let Some(nulls) = nulls { |
299 | 0 | for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { |
300 | 0 | if nulls.is_valid(i) { |
301 | 0 | let hash = &mut hashes_buffer[i]; |
302 | 0 | for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] { |
303 | 0 | *hash = combine_hashes(*hash, *values_hash); |
304 | 0 | } |
305 | 0 | } |
306 | | } |
307 | | } else { |
308 | 0 | for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() { |
309 | 0 | let hash = &mut hashes_buffer[i]; |
310 | 0 | for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] { |
311 | 0 | *hash = combine_hashes(*hash, *values_hash); |
312 | 0 | } |
313 | | } |
314 | | } |
315 | 0 | Ok(()) |
316 | 0 | } |
317 | | |
318 | | #[cfg(not(feature = "force_hash_collisions"))] |
319 | 0 | fn hash_fixed_list_array( |
320 | 0 | array: &FixedSizeListArray, |
321 | 0 | random_state: &RandomState, |
322 | 0 | hashes_buffer: &mut [u64], |
323 | 0 | ) -> Result<()> { |
324 | 0 | let values = Arc::clone(array.values()); |
325 | 0 | let value_len = array.value_length(); |
326 | 0 | let offset_size = value_len as usize / array.len(); |
327 | 0 | let nulls = array.nulls(); |
328 | 0 | let mut values_hashes = vec![0u64; values.len()]; |
329 | 0 | create_hashes(&[values], random_state, &mut values_hashes)?; |
330 | 0 | if let Some(nulls) = nulls { |
331 | 0 | for i in 0..array.len() { |
332 | 0 | if nulls.is_valid(i) { |
333 | 0 | let hash = &mut hashes_buffer[i]; |
334 | 0 | for values_hash in &values_hashes[i * offset_size..(i + 1) * offset_size] |
335 | 0 | { |
336 | 0 | *hash = combine_hashes(*hash, *values_hash); |
337 | 0 | } |
338 | 0 | } |
339 | | } |
340 | | } else { |
341 | 0 | for i in 0..array.len() { |
342 | 0 | let hash = &mut hashes_buffer[i]; |
343 | 0 | for values_hash in &values_hashes[i * offset_size..(i + 1) * offset_size] { |
344 | 0 | *hash = combine_hashes(*hash, *values_hash); |
345 | 0 | } |
346 | | } |
347 | | } |
348 | 0 | Ok(()) |
349 | 0 | } |
350 | | |
/// Test version of `create_hashes` that produces the same value for
/// all hashes (to test collisions)
///
/// See comments on `hashes_buffer` for more details
#[cfg(feature = "force_hash_collisions")]
pub fn create_hashes<'a>(
    _arrays: &[ArrayRef],
    _random_state: &RandomState,
    hashes_buffer: &'a mut Vec<u64>,
) -> Result<&'a mut Vec<u64>> {
    // Every row gets the identical hash, forcing all rows to collide.
    hashes_buffer.fill(0);
    Ok(hashes_buffer)
}
366 | | |
/// Creates hash values for every row, based on the values in the
/// columns.
///
/// The number of rows to hash is determined by `hashes_buffer.len()`.
/// `hashes_buffer` should be pre-sized appropriately
#[cfg(not(feature = "force_hash_collisions"))]
pub fn create_hashes<'a>(
    arrays: &[ArrayRef],
    random_state: &RandomState,
    hashes_buffer: &'a mut Vec<u64>,
) -> Result<&'a mut Vec<u64>> {
    for (i, col) in arrays.iter().enumerate() {
        let array = col.as_ref();
        // combine hashes with `combine_hashes` for all columns besides the first
        let rehash = i >= 1;
        // `downcast_primitive_array!` expands to a match over every primitive
        // `DataType` (dispatching to `hash_array_primitive`); the explicit
        // arms below handle the non-primitive types. Nested types
        // (dictionary, struct, list, map) recurse through `create_hashes`.
        downcast_primitive_array! {
            array => hash_array_primitive(array, random_state, hashes_buffer, rehash),
            DataType::Null => hash_null(random_state, hashes_buffer, rehash),
            DataType::Boolean => hash_array(as_boolean_array(array)?, random_state, hashes_buffer, rehash),
            DataType::Utf8 => hash_array(as_string_array(array)?, random_state, hashes_buffer, rehash),
            DataType::Utf8View => hash_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash),
            DataType::LargeUtf8 => hash_array(as_largestring_array(array), random_state, hashes_buffer, rehash),
            DataType::Binary => hash_array(as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
            DataType::BinaryView => hash_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
            DataType::LargeBinary => hash_array(as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
            DataType::FixedSizeBinary(_) => {
                let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
                hash_array(array, random_state, hashes_buffer, rehash)
            }
            DataType::Decimal128(_, _) => {
                let array = as_primitive_array::<Decimal128Type>(array)?;
                hash_array_primitive(array, random_state, hashes_buffer, rehash)
            }
            DataType::Decimal256(_, _) => {
                let array = as_primitive_array::<Decimal256Type>(array)?;
                hash_array_primitive(array, random_state, hashes_buffer, rehash)
            }
            DataType::Dictionary(_, _) => downcast_dictionary_array! {
                array => hash_dictionary(array, random_state, hashes_buffer, rehash)?,
                _ => unreachable!()
            }
            DataType::Struct(_) => {
                let array = as_struct_array(array)?;
                hash_struct_array(array, random_state, hashes_buffer)?;
            }
            DataType::List(_) => {
                let array = as_list_array(array)?;
                hash_list_array(array, random_state, hashes_buffer)?;
            }
            DataType::LargeList(_) => {
                let array = as_large_list_array(array)?;
                hash_list_array(array, random_state, hashes_buffer)?;
            }
            DataType::Map(_, _) => {
                let array = as_map_array(array)?;
                hash_map_array(array, random_state, hashes_buffer)?;
            }
            DataType::FixedSizeList(_,_) => {
                let array = as_fixed_size_list_array(array)?;
                hash_fixed_list_array(array, random_state, hashes_buffer)?;
            }
            _ => {
                // This is internal because we should have caught this before.
                return _internal_err!(
                    "Unsupported data type in hasher: {}",
                    col.data_type()
                );
            }
        }
    }
    Ok(hashes_buffer)
}
439 | | |
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::array::*;
    #[cfg(not(feature = "force_hash_collisions"))]
    use arrow::datatypes::*;

    use super::*;

    // Smoke test: hashing a Decimal128 column produces one hash per row.
    #[test]
    fn create_hashes_for_decimal_array() -> Result<()> {
        let array = vec![1, 2, 3, 4]
            .into_iter()
            .map(Some)
            .collect::<Decimal128Array>()
            .with_precision_and_scale(20, 3)
            .unwrap();
        let array_ref = Arc::new(array);
        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let hashes_buff = &mut vec![0; array_ref.len()];
        let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?;
        assert_eq!(hashes.len(), 4);
        Ok(())
    }

    // Floats are hashed by bit pattern (see `hash_float_value!`); this
    // verifies both f32 and f64 columns hash without error.
    #[test]
    fn create_hashes_for_float_arrays() -> Result<()> {
        let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
        let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));

        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let hashes_buff = &mut vec![0; f32_arr.len()];
        let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?;
        assert_eq!(hashes.len(), 4,);

        let hashes = create_hashes(&[f64_arr], &random_state, hashes_buff)?;
        assert_eq!(hashes.len(), 4,);

        Ok(())
    }

    // Generates a test asserting that a binary-like array type produces
    // the same hashes as the reference `BinaryArray` for equal values.
    macro_rules! create_hash_binary {
        ($NAME:ident, $ARRAY:ty) => {
            #[cfg(not(feature = "force_hash_collisions"))]
            #[test]
            fn $NAME() {
                let binary = [
                    Some(b"short".to_byte_slice()),
                    None,
                    Some(b"long but different 12 bytes string"),
                    Some(b"short2"),
                    Some(b"Longer than 12 bytes string"),
                    Some(b"short"),
                    Some(b"Longer than 12 bytes string"),
                ];

                let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>());
                let ref_array = Arc::new(binary.iter().cloned().collect::<BinaryArray>());

                let random_state = RandomState::with_seeds(0, 0, 0, 0);

                let mut binary_hashes = vec![0; binary.len()];
                create_hashes(&[binary_array], &random_state, &mut binary_hashes)
                    .unwrap();

                let mut ref_hashes = vec![0; binary.len()];
                create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap();

                // Null values result in a zero hash,
                for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
                    match val {
                        Some(_) => assert_ne!(*hash, 0),
                        None => assert_eq!(*hash, 0),
                    }
                }

                // same logical values should hash to the same hash value
                assert_eq!(binary_hashes, ref_hashes);

                // Same values should map to same hash values
                assert_eq!(binary[0], binary[5]);
                assert_eq!(binary[4], binary[6]);

                // different binary should map to different hash values
                assert_ne!(binary[0], binary[2]);
            }
        };
    }

    create_hash_binary!(binary_array, BinaryArray);
    create_hash_binary!(binary_view_array, BinaryViewArray);

    #[test]
    fn create_hashes_fixed_size_binary() -> Result<()> {
        let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]];
        let fixed_size_binary_array =
            Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap());

        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let hashes_buff = &mut vec![0; fixed_size_binary_array.len()];
        let hashes =
            create_hashes(&[fixed_size_binary_array], &random_state, hashes_buff)?;
        assert_eq!(hashes.len(), 3,);

        Ok(())
    }

    // Generates a test asserting that a string-like array type hashes
    // identically to the same values stored in a dictionary array.
    macro_rules! create_hash_string {
        ($NAME:ident, $ARRAY:ty) => {
            #[cfg(not(feature = "force_hash_collisions"))]
            #[test]
            fn $NAME() {
                let strings = [
                    Some("short"),
                    None,
                    Some("long but different 12 bytes string"),
                    Some("short2"),
                    Some("Longer than 12 bytes string"),
                    Some("short"),
                    Some("Longer than 12 bytes string"),
                ];

                let string_array = Arc::new(strings.iter().cloned().collect::<$ARRAY>());
                let dict_array = Arc::new(
                    strings
                        .iter()
                        .cloned()
                        .collect::<DictionaryArray<Int8Type>>(),
                );

                let random_state = RandomState::with_seeds(0, 0, 0, 0);

                let mut string_hashes = vec![0; strings.len()];
                create_hashes(&[string_array], &random_state, &mut string_hashes)
                    .unwrap();

                let mut dict_hashes = vec![0; strings.len()];
                create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();

                // Null values result in a zero hash,
                for (val, hash) in strings.iter().zip(string_hashes.iter()) {
                    match val {
                        Some(_) => assert_ne!(*hash, 0),
                        None => assert_eq!(*hash, 0),
                    }
                }

                // same logical values should hash to the same hash value
                assert_eq!(string_hashes, dict_hashes);

                // Same values should map to same hash values
                assert_eq!(strings[0], strings[5]);
                assert_eq!(strings[4], strings[6]);

                // different strings should map to different hash values
                assert_ne!(strings[0], strings[2]);
            }
        };
    }

    create_hash_string!(string_array, StringArray);
    create_hash_string!(large_string_array, LargeStringArray);
    // NOTE(review): this instantiation is named `string_view_array` but is
    // built from `StringArray` — it likely should use `StringViewArray`
    // to actually exercise the `Utf8View` hashing path. Confirm and fix.
    create_hash_string!(string_view_array, StringArray);
    create_hash_string!(dict_string_array, DictionaryArray<Int8Type>);

    #[test]
    // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_hashes_for_dict_arrays() {
        let strings = [Some("foo"), None, Some("bar"), Some("foo"), None];

        let string_array = Arc::new(strings.iter().cloned().collect::<StringArray>());
        let dict_array = Arc::new(
            strings
                .iter()
                .cloned()
                .collect::<DictionaryArray<Int8Type>>(),
        );

        let random_state = RandomState::with_seeds(0, 0, 0, 0);

        let mut string_hashes = vec![0; strings.len()];
        create_hashes(&[string_array], &random_state, &mut string_hashes).unwrap();

        let mut dict_hashes = vec![0; strings.len()];
        create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();

        // Null values result in a zero hash,
        for (val, hash) in strings.iter().zip(string_hashes.iter()) {
            match val {
                Some(_) => assert_ne!(*hash, 0),
                None => assert_eq!(*hash, 0),
            }
        }

        // same logical values should hash to the same hash value
        assert_eq!(string_hashes, dict_hashes);

        // Same values should map to same hash values
        assert_eq!(strings[1], strings[4]);
        assert_eq!(dict_hashes[1], dict_hashes[4]);
        assert_eq!(strings[0], strings[3]);
        assert_eq!(dict_hashes[0], dict_hashes[3]);

        // different strings should map to different hash values
        assert_ne!(strings[0], strings[2]);
        assert_ne!(dict_hashes[0], dict_hashes[2]);
    }

    #[test]
    // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_hashes_for_list_arrays() {
        let data = vec![
            Some(vec![Some(0), Some(1), Some(2)]),
            None,
            Some(vec![Some(3), None, Some(5)]),
            Some(vec![Some(3), None, Some(5)]),
            None,
            Some(vec![Some(0), Some(1), Some(2)]),
            Some(vec![]),
        ];
        let list_array =
            Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let mut hashes = vec![0; list_array.len()];
        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
        assert_eq!(hashes[0], hashes[5]);
        assert_eq!(hashes[1], hashes[4]);
        assert_eq!(hashes[2], hashes[3]);
        assert_eq!(hashes[1], hashes[6]); // null vs empty list
    }

    #[test]
    // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_hashes_for_fixed_size_list_arrays() {
        let data = vec![
            Some(vec![Some(0), Some(1), Some(2)]),
            None,
            Some(vec![Some(3), None, Some(5)]),
            Some(vec![Some(3), None, Some(5)]),
            None,
            Some(vec![Some(0), Some(1), Some(2)]),
        ];
        let list_array =
            Arc::new(FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
                data, 3,
            )) as ArrayRef;
        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let mut hashes = vec![0; list_array.len()];
        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
        assert_eq!(hashes[0], hashes[5]);
        assert_eq!(hashes[1], hashes[4]);
        assert_eq!(hashes[2], hashes[3]);
    }

    #[test]
    // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_hashes_for_struct_arrays() {
        use arrow_buffer::Buffer;

        let boolarr = Arc::new(BooleanArray::from(vec![
            false, false, true, true, true, true,
        ]));
        let i32arr = Arc::new(Int32Array::from(vec![10, 10, 20, 20, 30, 31]));

        // The validity buffer 0b001011 marks rows 0, 1 and 3 valid.
        let struct_array = StructArray::from((
            vec![
                (
                    Arc::new(Field::new("bool", DataType::Boolean, false)),
                    Arc::clone(&boolarr) as ArrayRef,
                ),
                (
                    Arc::new(Field::new("i32", DataType::Int32, false)),
                    Arc::clone(&i32arr) as ArrayRef,
                ),
                (
                    Arc::new(Field::new("i32", DataType::Int32, false)),
                    Arc::clone(&i32arr) as ArrayRef,
                ),
                (
                    Arc::new(Field::new("bool", DataType::Boolean, false)),
                    Arc::clone(&boolarr) as ArrayRef,
                ),
            ],
            Buffer::from(&[0b001011]),
        ));

        assert!(struct_array.is_valid(0));
        assert!(struct_array.is_valid(1));
        assert!(struct_array.is_null(2));
        assert!(struct_array.is_valid(3));
        assert!(struct_array.is_null(4));
        assert!(struct_array.is_null(5));

        let array = Arc::new(struct_array) as ArrayRef;

        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let mut hashes = vec![0; array.len()];
        create_hashes(&[array], &random_state, &mut hashes).unwrap();
        assert_eq!(hashes[0], hashes[1]);
        // same value but the third row ( hashes[2] ) is null
        assert_ne!(hashes[2], hashes[3]);
        // different values but both are null
        assert_eq!(hashes[4], hashes[5]);
    }

    #[test]
    // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_hashes_for_struct_arrays_more_column_than_row() {
        let struct_array = StructArray::from(vec![
            (
                Arc::new(Field::new("bool", DataType::Boolean, false)),
                Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef,
            ),
            (
                Arc::new(Field::new("i32-1", DataType::Int32, false)),
                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
            ),
            (
                Arc::new(Field::new("i32-2", DataType::Int32, false)),
                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
            ),
            (
                Arc::new(Field::new("i32-3", DataType::Int32, false)),
                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
            ),
        ]);

        assert!(struct_array.is_valid(0));
        assert!(struct_array.is_valid(1));

        let array = Arc::new(struct_array) as ArrayRef;
        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let mut hashes = vec![0; array.len()];
        create_hashes(&[array], &random_state, &mut hashes).unwrap();
        assert_eq!(hashes[0], hashes[1]);
    }

    #[test]
    // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_hashes_for_map_arrays() {
        let mut builder =
            MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
        // Row 0
        builder.keys().append_value("key1");
        builder.keys().append_value("key2");
        builder.values().append_value(1);
        builder.values().append_value(2);
        builder.append(true).unwrap();
        // Row 1
        builder.keys().append_value("key1");
        builder.keys().append_value("key2");
        builder.values().append_value(1);
        builder.values().append_value(2);
        builder.append(true).unwrap();
        // Row 2
        builder.keys().append_value("key1");
        builder.keys().append_value("key2");
        builder.values().append_value(1);
        builder.values().append_value(3);
        builder.append(true).unwrap();
        // Row 3
        builder.keys().append_value("key1");
        builder.keys().append_value("key3");
        builder.values().append_value(1);
        builder.values().append_value(2);
        builder.append(true).unwrap();
        // Row 4
        builder.keys().append_value("key1");
        builder.values().append_value(1);
        builder.append(true).unwrap();
        // Row 5
        builder.keys().append_value("key1");
        builder.values().append_null();
        builder.append(true).unwrap();
        // Row 6
        builder.append(true).unwrap();
        // Row 7
        builder.keys().append_value("key1");
        builder.values().append_value(1);
        builder.append(false).unwrap();

        let array = Arc::new(builder.finish()) as ArrayRef;

        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let mut hashes = vec![0; array.len()];
        create_hashes(&[array], &random_state, &mut hashes).unwrap();
        assert_eq!(hashes[0], hashes[1]); // same value
        assert_ne!(hashes[0], hashes[2]); // different value
        assert_ne!(hashes[0], hashes[3]); // different key
        assert_ne!(hashes[0], hashes[4]); // missing an entry
        assert_ne!(hashes[4], hashes[5]); // filled vs null value
        assert_eq!(hashes[6], hashes[7]); // empty vs null map
    }

    #[test]
    // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_multi_column_hash_for_dict_arrays() {
        let strings1 = [Some("foo"), None, Some("bar")];
        let strings2 = [Some("blarg"), Some("blah"), None];

        let string_array = Arc::new(strings1.iter().cloned().collect::<StringArray>());
        let dict_array = Arc::new(
            strings2
                .iter()
                .cloned()
                .collect::<DictionaryArray<Int32Type>>(),
        );

        let random_state = RandomState::with_seeds(0, 0, 0, 0);

        let mut one_col_hashes = vec![0; strings1.len()];
        create_hashes(
            &[Arc::clone(&dict_array) as ArrayRef],
            &random_state,
            &mut one_col_hashes,
        )
        .unwrap();

        let mut two_col_hashes = vec![0; strings1.len()];
        create_hashes(
            &[dict_array, string_array],
            &random_state,
            &mut two_col_hashes,
        )
        .unwrap();

        assert_eq!(one_col_hashes.len(), 3);
        assert_eq!(two_col_hashes.len(), 3);

        // Hashing a second column must change the per-row hashes.
        assert_ne!(one_col_hashes, two_col_hashes);
    }
}
879 | | } |