Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/common/src/hash_utils.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Functionality used both on logical and physical plans
19
20
#[cfg(not(feature = "force_hash_collisions"))]
21
use std::sync::Arc;
22
23
use ahash::RandomState;
24
use arrow::array::*;
25
use arrow::datatypes::*;
26
#[cfg(not(feature = "force_hash_collisions"))]
27
use arrow::{downcast_dictionary_array, downcast_primitive_array};
28
use arrow_buffer::IntervalDayTime;
29
use arrow_buffer::IntervalMonthDayNano;
30
31
#[cfg(not(feature = "force_hash_collisions"))]
32
use crate::cast::{
33
    as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
34
    as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
35
    as_primitive_array, as_string_array, as_string_view_array, as_struct_array,
36
};
37
use crate::error::Result;
38
#[cfg(not(feature = "force_hash_collisions"))]
39
use crate::error::_internal_err;
40
41
/// Combines two hashes into one hash.
///
/// Starting from the constant `17 * 37`, this adds `l`, multiplies the sum by
/// `37` and finally adds `r`. Every step uses wrapping arithmetic, so the
/// function never panics on overflow. The combination is order-sensitive:
/// swapping `l` and `r` generally yields a different result.
///
/// This is a `const fn`, so it may also be evaluated in constant contexts.
#[inline]
pub const fn combine_hashes(l: u64, r: u64) -> u64 {
    let hash = (17 * 37u64).wrapping_add(l);
    hash.wrapping_mul(37).wrapping_add(r)
}
47
48
#[cfg(not(feature = "force_hash_collisions"))]
49
0
fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) {
50
0
    if mul_col {
51
0
        hashes_buffer.iter_mut().for_each(|hash| {
52
0
            // stable hash for null value
53
0
            *hash = combine_hashes(random_state.hash_one(1), *hash);
54
0
        })
55
    } else {
56
0
        hashes_buffer.iter_mut().for_each(|hash| {
57
0
            *hash = random_state.hash_one(1);
58
0
        })
59
    }
60
0
}
61
62
pub trait HashValue {
63
    fn hash_one(&self, state: &RandomState) -> u64;
64
}
65
66
impl<'a, T: HashValue + ?Sized> HashValue for &'a T {
67
20
    fn hash_one(&self, state: &RandomState) -> u64 {
68
20
        T::hash_one(self, state)
69
20
    }
70
}
71
72
macro_rules! hash_value {
73
    ($($t:ty),+) => {
74
        $(impl HashValue for $t {
75
134k
            fn hash_one(&self, state: &RandomState) -> u64 {
76
134k
                state.hash_one(self)
77
134k
            }
78
        })+
79
    };
80
}
81
hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
82
hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
83
84
macro_rules! hash_float_value {
85
    ($(($t:ty, $i:ty)),+) => {
86
        $(impl HashValue for $t {
87
65.7k
            fn hash_one(&self, state: &RandomState) -> u64 {
88
65.7k
                state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes()))
89
65.7k
            }
90
        })+
91
    };
92
}
93
hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
94
95
/// Builds hash values of PrimitiveArray and writes them into `hashes_buffer`
96
/// If `rehash==true` this combines the previous hash value in the buffer
97
/// with the new hash using `combine_hashes`
98
#[cfg(not(feature = "force_hash_collisions"))]
99
29.5k
fn hash_array_primitive<T>(
100
29.5k
    array: &PrimitiveArray<T>,
101
29.5k
    random_state: &RandomState,
102
29.5k
    hashes_buffer: &mut [u64],
103
29.5k
    rehash: bool,
104
29.5k
) where
105
29.5k
    T: ArrowPrimitiveType,
106
29.5k
    <T as arrow_array::ArrowPrimitiveType>::Native: HashValue,
107
29.5k
{
108
29.5k
    assert_eq!(
109
29.5k
        hashes_buffer.len(),
110
29.5k
        array.len(),
111
0
        "hashes_buffer and array should be of equal length"
112
    );
113
114
29.5k
    if array.null_count() == 0 {
115
29.4k
        if rehash {
116
65.7k
            for (hash, &value) in 
hashes_buffer.iter_mut().zip(array.values().iter())69
{
117
65.7k
                *hash = combine_hashes(value.hash_one(random_state), *hash);
118
65.7k
            }
119
        } else {
120
133k
            for (hash, &value) in 
hashes_buffer.iter_mut().zip(array.values().iter())29.4k
{
121
133k
                *hash = value.hash_one(random_state);
122
133k
            }
123
        }
124
72
    } else if rehash {
125
131k
        for (i, hash) in 
hashes_buffer.iter_mut().enumerate()36
{
126
131k
            if !array.is_null(i) {
127
20
                let value = unsafe { array.value_unchecked(i) };
128
20
                *hash = combine_hashes(value.hash_one(random_state), *hash);
129
131k
            }
130
        }
131
    } else {
132
65.6k
        for (i, hash) in 
hashes_buffer.iter_mut().enumerate()36
{
133
65.6k
            if !array.is_null(i) {
134
30
                let value = unsafe { array.value_unchecked(i) };
135
30
                *hash = value.hash_one(random_state);
136
65.6k
            }
137
        }
138
    }
139
29.5k
}
140
141
/// Hashes one array into the `hashes_buffer`
142
/// If `rehash==true` this combines the previous hash value in the buffer
143
/// with the new hash using `combine_hashes`
144
#[cfg(not(feature = "force_hash_collisions"))]
145
11
fn hash_array<T>(
146
11
    array: T,
147
11
    random_state: &RandomState,
148
11
    hashes_buffer: &mut [u64],
149
11
    rehash: bool,
150
11
) where
151
11
    T: ArrayAccessor,
152
11
    T::Item: HashValue,
153
11
{
154
11
    assert_eq!(
155
11
        hashes_buffer.len(),
156
11
        array.len(),
157
0
        "hashes_buffer and array should be of equal length"
158
    );
159
160
11
    if array.null_count() == 0 {
161
11
        if rehash {
162
0
            for (i, hash) in hashes_buffer.iter_mut().enumerate() {
163
0
                let value = unsafe { array.value_unchecked(i) };
164
0
                *hash = combine_hashes(value.hash_one(random_state), *hash);
165
0
            }
166
        } else {
167
20
            for (i, hash) in 
hashes_buffer.iter_mut().enumerate()11
{
168
20
                let value = unsafe { array.value_unchecked(i) };
169
20
                *hash = value.hash_one(random_state);
170
20
            }
171
        }
172
0
    } else if rehash {
173
0
        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
174
0
            if !array.is_null(i) {
175
0
                let value = unsafe { array.value_unchecked(i) };
176
0
                *hash = combine_hashes(value.hash_one(random_state), *hash);
177
0
            }
178
        }
179
    } else {
180
0
        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
181
0
            if !array.is_null(i) {
182
0
                let value = unsafe { array.value_unchecked(i) };
183
0
                *hash = value.hash_one(random_state);
184
0
            }
185
        }
186
    }
187
11
}
188
189
/// Hash the values in a dictionary array
190
#[cfg(not(feature = "force_hash_collisions"))]
191
2
fn hash_dictionary<K: ArrowDictionaryKeyType>(
192
2
    array: &DictionaryArray<K>,
193
2
    random_state: &RandomState,
194
2
    hashes_buffer: &mut [u64],
195
2
    multi_col: bool,
196
2
) -> Result<()> {
197
2
    // Hash each dictionary value once, and then use that computed
198
2
    // hash for each key value to avoid a potentially expensive
199
2
    // redundant hashing for large dictionary elements (e.g. strings)
200
2
    let values = Arc::clone(array.values());
201
2
    let mut dict_hashes = vec![0; values.len()];
202
2
    create_hashes(&[values], random_state, &mut dict_hashes)
?0
;
203
204
    // combine hash for each index in values
205
2
    if multi_col {
206
3
        for (hash, key) in 
hashes_buffer.iter_mut().zip(array.keys().iter())1
{
207
3
            if let Some(key) = key {
208
3
                *hash = combine_hashes(dict_hashes[key.as_usize()], *hash)
209
0
            } // no update for Null, consistent with other hashes
210
        }
211
    } else {
212
3
        for (hash, key) in 
hashes_buffer.iter_mut().zip(array.keys().iter())1
{
213
3
            if let Some(
key2
) = key {
214
2
                *hash = dict_hashes[key.as_usize()]
215
1
            } // no update for Null, consistent with other hashes
216
        }
217
    }
218
2
    Ok(())
219
2
}
220
221
#[cfg(not(feature = "force_hash_collisions"))]
222
7
fn hash_struct_array(
223
7
    array: &StructArray,
224
7
    random_state: &RandomState,
225
7
    hashes_buffer: &mut [u64],
226
7
) -> Result<()> {
227
7
    let nulls = array.nulls();
228
7
    let row_len = array.len();
229
230
7
    let valid_row_indices: Vec<usize> = if let Some(
nulls4
) = nulls {
231
4
        nulls.valid_indices().collect()
232
    } else {
233
3
        (0..row_len).collect()
234
    };
235
236
    // Create hashes for each row that combines the hashes over all the column at that row.
237
7
    let mut values_hashes = vec![0u64; row_len];
238
7
    create_hashes(array.columns(), random_state, &mut values_hashes)
?0
;
239
240
18
    for 
i11
in valid_row_indices {
241
11
        let hash = &mut hashes_buffer[i];
242
11
        *hash = combine_hashes(*hash, values_hashes[i]);
243
11
    }
244
245
7
    Ok(())
246
7
}
247
248
// only adding this `cfg` b/c this function is only used with this `cfg`
249
#[cfg(not(feature = "force_hash_collisions"))]
250
0
fn hash_map_array(
251
0
    array: &MapArray,
252
0
    random_state: &RandomState,
253
0
    hashes_buffer: &mut [u64],
254
0
) -> Result<()> {
255
0
    let nulls = array.nulls();
256
0
    let offsets = array.offsets();
257
0
258
0
    // Create hashes for each entry in each row
259
0
    let mut values_hashes = vec![0u64; array.entries().len()];
260
0
    create_hashes(array.entries().columns(), random_state, &mut values_hashes)?;
261
262
    // Combine the hashes for entries on each row with each other and previous hash for that row
263
0
    if let Some(nulls) = nulls {
264
0
        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
265
0
            if nulls.is_valid(i) {
266
0
                let hash = &mut hashes_buffer[i];
267
0
                for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
268
0
                    *hash = combine_hashes(*hash, *values_hash);
269
0
                }
270
0
            }
271
        }
272
    } else {
273
0
        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
274
0
            let hash = &mut hashes_buffer[i];
275
0
            for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
276
0
                *hash = combine_hashes(*hash, *values_hash);
277
0
            }
278
        }
279
    }
280
281
0
    Ok(())
282
0
}
283
284
#[cfg(not(feature = "force_hash_collisions"))]
285
0
fn hash_list_array<OffsetSize>(
286
0
    array: &GenericListArray<OffsetSize>,
287
0
    random_state: &RandomState,
288
0
    hashes_buffer: &mut [u64],
289
0
) -> Result<()>
290
0
where
291
0
    OffsetSize: OffsetSizeTrait,
292
0
{
293
0
    let values = Arc::clone(array.values());
294
0
    let offsets = array.value_offsets();
295
0
    let nulls = array.nulls();
296
0
    let mut values_hashes = vec![0u64; values.len()];
297
0
    create_hashes(&[values], random_state, &mut values_hashes)?;
298
0
    if let Some(nulls) = nulls {
299
0
        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
300
0
            if nulls.is_valid(i) {
301
0
                let hash = &mut hashes_buffer[i];
302
0
                for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
303
0
                    *hash = combine_hashes(*hash, *values_hash);
304
0
                }
305
0
            }
306
        }
307
    } else {
308
0
        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
309
0
            let hash = &mut hashes_buffer[i];
310
0
            for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
311
0
                *hash = combine_hashes(*hash, *values_hash);
312
0
            }
313
        }
314
    }
315
0
    Ok(())
316
0
}
317
318
#[cfg(not(feature = "force_hash_collisions"))]
319
0
fn hash_fixed_list_array(
320
0
    array: &FixedSizeListArray,
321
0
    random_state: &RandomState,
322
0
    hashes_buffer: &mut [u64],
323
0
) -> Result<()> {
324
0
    let values = Arc::clone(array.values());
325
0
    let value_len = array.value_length();
326
0
    let offset_size = value_len as usize / array.len();
327
0
    let nulls = array.nulls();
328
0
    let mut values_hashes = vec![0u64; values.len()];
329
0
    create_hashes(&[values], random_state, &mut values_hashes)?;
330
0
    if let Some(nulls) = nulls {
331
0
        for i in 0..array.len() {
332
0
            if nulls.is_valid(i) {
333
0
                let hash = &mut hashes_buffer[i];
334
0
                for values_hash in &values_hashes[i * offset_size..(i + 1) * offset_size]
335
0
                {
336
0
                    *hash = combine_hashes(*hash, *values_hash);
337
0
                }
338
0
            }
339
        }
340
    } else {
341
0
        for i in 0..array.len() {
342
0
            let hash = &mut hashes_buffer[i];
343
0
            for values_hash in &values_hashes[i * offset_size..(i + 1) * offset_size] {
344
0
                *hash = combine_hashes(*hash, *values_hash);
345
0
            }
346
        }
347
    }
348
0
    Ok(())
349
0
}
350
351
/// Collision-forcing stand-in for `create_hashes`, compiled only when the
/// `force_hash_collisions` feature is enabled.
///
/// The input arrays and random state are ignored; every slot of
/// `hashes_buffer` is set to `0` so that all rows collide. The same buffer is
/// returned.
#[cfg(feature = "force_hash_collisions")]
pub fn create_hashes<'a>(
    _arrays: &[ArrayRef],
    _random_state: &RandomState,
    hashes_buffer: &'a mut Vec<u64>,
) -> Result<&'a mut Vec<u64>> {
    hashes_buffer.fill(0);
    Ok(hashes_buffer)
}
366
367
/// Creates hash values for every row, based on the values in the
368
/// columns.
369
///
370
/// The number of rows to hash is determined by `hashes_buffer.len()`.
371
/// `hashes_buffer` should be pre-sized appropriately
372
#[cfg(not(feature = "force_hash_collisions"))]
373
29.4k
pub fn create_hashes<'a>(
374
29.4k
    arrays: &[ArrayRef],
375
29.4k
    random_state: &RandomState,
376
29.4k
    hashes_buffer: &'a mut Vec<u64>,
377
29.4k
) -> Result<&'a mut Vec<u64>> {
378
29.5k
    for (i, col) in 
arrays.iter().enumerate()29.4k
{
379
29.5k
        let array = col.as_ref();
380
29.5k
        // combine hashes with `combine_hashes` for all columns besides the first
381
29.5k
        let rehash = i >= 1;
382
29.5k
        downcast_primitive_array! {
383
0
            array => hash_array_primitive(array, random_state, hashes_buffer, rehash),
384
0
            DataType::Null => hash_null(random_state, hashes_buffer, rehash),
385
0
            DataType::Boolean => hash_array(as_boolean_array(array)?, random_state, hashes_buffer, rehash),
386
11
            DataType::Utf8 => hash_array(as_string_array(array)
?0
, random_state, hashes_buffer, rehash),
387
0
            DataType::Utf8View => hash_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash),
388
0
            DataType::LargeUtf8 => hash_array(as_largestring_array(array), random_state, hashes_buffer, rehash),
389
0
            DataType::Binary => hash_array(as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
390
0
            DataType::BinaryView => hash_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
391
0
            DataType::LargeBinary => hash_array(as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
392
            DataType::FixedSizeBinary(_) => {
393
0
                let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
394
0
                hash_array(array, random_state, hashes_buffer, rehash)
395
            }
396
            DataType::Decimal128(_, _) => {
397
0
                let array = as_primitive_array::<Decimal128Type>(array)?;
398
0
                hash_array_primitive(array, random_state, hashes_buffer, rehash)
399
            }
400
            DataType::Decimal256(_, _) => {
401
0
                let array = as_primitive_array::<Decimal256Type>(array)?;
402
0
                hash_array_primitive(array, random_state, hashes_buffer, rehash)
403
            }
404
2
            DataType::Dictionary(_, _) => downcast_dictionary_array! {
405
0
                array => hash_dictionary(array, random_state, hashes_buffer, rehash)?,
406
0
                _ => unreachable!()
407
            }
408
            DataType::Struct(_) => {
409
7
                let array = as_struct_array(array)
?0
;
410
7
                hash_struct_array(array, random_state, hashes_buffer)
?0
;
411
            }
412
            DataType::List(_) => {
413
0
                let array = as_list_array(array)?;
414
0
                hash_list_array(array, random_state, hashes_buffer)?;
415
            }
416
            DataType::LargeList(_) => {
417
0
                let array = as_large_list_array(array)?;
418
0
                hash_list_array(array, random_state, hashes_buffer)?;
419
            }
420
            DataType::Map(_, _) => {
421
0
                let array = as_map_array(array)?;
422
0
                hash_map_array(array, random_state, hashes_buffer)?;
423
            }
424
            DataType::FixedSizeList(_,_) => {
425
0
                let array = as_fixed_size_list_array(array)?;
426
0
                hash_fixed_list_array(array, random_state, hashes_buffer)?;
427
            }
428
            _ => {
429
                // This is internal because we should have caught this before.
430
0
                return _internal_err!(
431
0
                    "Unsupported data type in hasher: {}",
432
0
                    col.data_type()
433
0
                );
434
            }
435
        }
436
    }
437
29.4k
    Ok(hashes_buffer)
438
29.4k
}
439
440
#[cfg(test)]
441
mod tests {
442
    use std::sync::Arc;
443
444
    use arrow::array::*;
445
    #[cfg(not(feature = "force_hash_collisions"))]
446
    use arrow::datatypes::*;
447
448
    use super::*;
449
450
    /// Hashing a `Decimal128Array` yields one hash per element.
    #[test]
    fn create_hashes_for_decimal_array() -> Result<()> {
        let decimals = vec![1, 2, 3, 4]
            .into_iter()
            .map(Some)
            .collect::<Decimal128Array>()
            .with_precision_and_scale(20, 3)
            .unwrap();
        let decimals: ArrayRef = Arc::new(decimals);

        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let mut buffer = vec![0; decimals.len()];
        let hashes = create_hashes(&[decimals], &random_state, &mut buffer)?;
        assert_eq!(hashes.len(), 4);
        Ok(())
    }
465
466
    /// Hashing `Float32Array` and `Float64Array` yields one hash per element.
    #[test]
    fn create_hashes_for_float_arrays() -> Result<()> {
        let f32_arr: ArrayRef =
            Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
        let f64_arr: ArrayRef =
            Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));

        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        // The same buffer is reused for both arrays.
        let mut buffer = vec![0; f32_arr.len()];

        for input in [f32_arr, f64_arr] {
            let hashes = create_hashes(&[input], &random_state, &mut buffer)?;
            assert_eq!(hashes.len(), 4,);
        }

        Ok(())
    }
481
482
    /// Generates a test named `$NAME` that hashes the same binary values as
    /// `$ARRAY` and as a plain `BinaryArray`, then checks that:
    /// * null entries leave the (zero-initialised) hash untouched,
    /// * both array types produce identical hashes,
    /// * equal inputs hash equally and different inputs hash differently.
    macro_rules! create_hash_binary {
        ($NAME:ident, $ARRAY:ty) => {
            #[cfg(not(feature = "force_hash_collisions"))]
            #[test]
            fn $NAME() {
                let binary = [
                    Some(b"short".to_byte_slice()),
                    None,
                    Some(b"long but different 12 bytes string"),
                    Some(b"short2"),
                    Some(b"Longer than 12 bytes string"),
                    Some(b"short"),
                    Some(b"Longer than 12 bytes string"),
                ];

                let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>());
                let ref_array = Arc::new(binary.iter().cloned().collect::<BinaryArray>());

                let random_state = RandomState::with_seeds(0, 0, 0, 0);

                let mut binary_hashes = vec![0; binary.len()];
                create_hashes(&[binary_array], &random_state, &mut binary_hashes)
                    .unwrap();

                let mut ref_hashes = vec![0; binary.len()];
                create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap();

                // Null values result in a zero hash,
                for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
                    match val {
                        Some(_) => assert_ne!(*hash, 0),
                        None => assert_eq!(*hash, 0),
                    }
                }

                // same logical values should hash to the same hash value
                assert_eq!(binary_hashes, ref_hashes);

                // Same values should map to same hash values
                // (check the hashes themselves, not only the inputs)
                assert_eq!(binary[0], binary[5]);
                assert_eq!(binary_hashes[0], binary_hashes[5]);
                assert_eq!(binary[4], binary[6]);
                assert_eq!(binary_hashes[4], binary_hashes[6]);

                // different binary should map to different hash values
                assert_ne!(binary[0], binary[2]);
                assert_ne!(binary_hashes[0], binary_hashes[2]);
            }
        };
    }

    create_hash_binary!(binary_array, BinaryArray);
    create_hash_binary!(binary_view_array, BinaryViewArray);
532
533
    /// Hashing a `FixedSizeBinaryArray` yields one hash per element.
    #[test]
    fn create_hashes_fixed_size_binary() -> Result<()> {
        let rows = vec![vec![1, 2], vec![5, 6], vec![5, 6]];
        let fixed: ArrayRef =
            Arc::new(FixedSizeBinaryArray::try_from_iter(rows.into_iter()).unwrap());

        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let mut buffer = vec![0; fixed.len()];
        let hashes = create_hashes(&[fixed], &random_state, &mut buffer)?;
        assert_eq!(hashes.len(), 3,);

        Ok(())
    }
547
548
    /// Generates a test named `$NAME` that hashes the same strings as
    /// `$ARRAY` and as a `DictionaryArray<Int8Type>`, then checks that:
    /// * null entries leave the (zero-initialised) hash untouched,
    /// * both array types produce identical hashes,
    /// * equal inputs hash equally and different inputs hash differently.
    macro_rules! create_hash_string {
        ($NAME:ident, $ARRAY:ty) => {
            #[cfg(not(feature = "force_hash_collisions"))]
            #[test]
            fn $NAME() {
                let strings = [
                    Some("short"),
                    None,
                    Some("long but different 12 bytes string"),
                    Some("short2"),
                    Some("Longer than 12 bytes string"),
                    Some("short"),
                    Some("Longer than 12 bytes string"),
                ];

                let string_array = Arc::new(strings.iter().cloned().collect::<$ARRAY>());
                let dict_array = Arc::new(
                    strings
                        .iter()
                        .cloned()
                        .collect::<DictionaryArray<Int8Type>>(),
                );

                let random_state = RandomState::with_seeds(0, 0, 0, 0);

                let mut string_hashes = vec![0; strings.len()];
                create_hashes(&[string_array], &random_state, &mut string_hashes)
                    .unwrap();

                let mut dict_hashes = vec![0; strings.len()];
                create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();

                // Null values result in a zero hash,
                for (val, hash) in strings.iter().zip(string_hashes.iter()) {
                    match val {
                        Some(_) => assert_ne!(*hash, 0),
                        None => assert_eq!(*hash, 0),
                    }
                }

                // same logical values should hash to the same hash value
                assert_eq!(string_hashes, dict_hashes);

                // Same values should map to same hash values
                // (check the hashes themselves, not only the inputs)
                assert_eq!(strings[0], strings[5]);
                assert_eq!(string_hashes[0], string_hashes[5]);
                assert_eq!(strings[4], strings[6]);
                assert_eq!(string_hashes[4], string_hashes[6]);

                // different strings should map to different hash values
                assert_ne!(strings[0], strings[2]);
                assert_ne!(string_hashes[0], string_hashes[2]);
            }
        };
    }

    create_hash_string!(string_array, StringArray);
    create_hash_string!(large_string_array, LargeStringArray);
    // Must use the view type here; `StringArray` would only repeat the first test.
    create_hash_string!(string_view_array, StringViewArray);
    create_hash_string!(dict_string_array, DictionaryArray<Int8Type>);
605
606
    /// A dictionary-encoded string column must hash exactly like the plain
    /// string column holding the same logical values, nulls included.
    #[test]
    // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_hashes_for_dict_arrays() {
        let strings = [Some("foo"), None, Some("bar"), Some("foo"), None];

        let plain: ArrayRef = Arc::new(strings.iter().cloned().collect::<StringArray>());
        let encoded: ArrayRef = Arc::new(
            strings
                .iter()
                .cloned()
                .collect::<DictionaryArray<Int8Type>>(),
        );

        let random_state = RandomState::with_seeds(0, 0, 0, 0);

        let mut string_hashes = vec![0; strings.len()];
        create_hashes(&[plain], &random_state, &mut string_hashes).unwrap();

        let mut dict_hashes = vec![0; strings.len()];
        create_hashes(&[encoded], &random_state, &mut dict_hashes).unwrap();

        // Null values result in a zero hash,
        for (val, hash) in strings.iter().zip(string_hashes.iter()) {
            if val.is_some() {
                assert_ne!(*hash, 0);
            } else {
                assert_eq!(*hash, 0);
            }
        }

        // same logical values should hash to the same hash value
        assert_eq!(string_hashes, dict_hashes);

        // Same values should map to same hash values
        for (a, b) in [(1, 4), (0, 3)] {
            assert_eq!(strings[a], strings[b]);
            assert_eq!(dict_hashes[a], dict_hashes[b]);
        }

        // different strings should map to different hash values
        assert_ne!(strings[0], strings[2]);
        assert_ne!(dict_hashes[0], dict_hashes[2]);
    }
649
650
    /// Rows of a `ListArray` with equal content hash equally; null rows and
    /// empty lists both leave the hash untouched.
    #[test]
    // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_hashes_for_list_arrays() {
        let rows = vec![
            Some(vec![Some(0), Some(1), Some(2)]),
            None,
            Some(vec![Some(3), None, Some(5)]),
            Some(vec![Some(3), None, Some(5)]),
            None,
            Some(vec![Some(0), Some(1), Some(2)]),
            Some(vec![]),
        ];
        let lists: ArrayRef =
            Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(rows));

        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let mut hashes = vec![0; lists.len()];
        create_hashes(&[lists], &random_state, &mut hashes).unwrap();

        // Pairs of rows expected to share a hash; the last pair is
        // null vs empty list.
        for (a, b) in [(0, 5), (1, 4), (2, 3), (1, 6)] {
            assert_eq!(hashes[a], hashes[b]);
        }
    }
673
674
    /// Rows of a `FixedSizeListArray` with equal content (or both null) hash
    /// equally.
    #[test]
    // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_hashes_for_fixed_size_list_arrays() {
        let rows = vec![
            Some(vec![Some(0), Some(1), Some(2)]),
            None,
            Some(vec![Some(3), None, Some(5)]),
            Some(vec![Some(3), None, Some(5)]),
            None,
            Some(vec![Some(0), Some(1), Some(2)]),
        ];
        let lists: ArrayRef = Arc::new(
            FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(rows, 3),
        );

        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let mut hashes = vec![0; lists.len()];
        create_hashes(&[lists], &random_state, &mut hashes).unwrap();

        // Pairs of rows expected to share a hash.
        for (a, b) in [(0, 5), (1, 4), (2, 3)] {
            assert_eq!(hashes[a], hashes[b]);
        }
    }
697
698
    /// Valid struct rows with equal fields hash equally, a null row hashes
    /// differently from a valid row with the same fields, and all null rows
    /// hash alike regardless of their field values.
    #[test]
    // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_hashes_for_struct_arrays() {
        use arrow_buffer::Buffer;

        let bool_col: ArrayRef = Arc::new(BooleanArray::from(vec![
            false, false, true, true, true, true,
        ]));
        let int_col: ArrayRef = Arc::new(Int32Array::from(vec![10, 10, 20, 20, 30, 31]));

        let bool_field = || Arc::new(Field::new("bool", DataType::Boolean, false));
        let int_field = || Arc::new(Field::new("i32", DataType::Int32, false));

        let struct_array = StructArray::from((
            vec![
                (bool_field(), Arc::clone(&bool_col)),
                (int_field(), Arc::clone(&int_col)),
                (int_field(), Arc::clone(&int_col)),
                (bool_field(), Arc::clone(&bool_col)),
            ],
            // Validity bitmap: rows 0, 1 and 3 are valid.
            Buffer::from(&[0b001011]),
        ));

        let expected_validity = [true, true, false, true, false, false];
        for (row, expected) in expected_validity.into_iter().enumerate() {
            assert_eq!(struct_array.is_valid(row), expected);
            assert_eq!(struct_array.is_null(row), !expected);
        }

        let array = Arc::new(struct_array) as ArrayRef;

        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let mut hashes = vec![0; array.len()];
        create_hashes(&[array], &random_state, &mut hashes).unwrap();

        assert_eq!(hashes[0], hashes[1]);
        // same value but the third row ( hashes[2] ) is null
        assert_ne!(hashes[2], hashes[3]);
        // different values but both are null
        assert_eq!(hashes[4], hashes[5]);
    }
749
750
    /// A struct with more child columns than rows still hashes identical rows
    /// identically.
    #[test]
    // Tests actual values of hashes, which are different if forcing collisions
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_hashes_for_struct_arrays_more_column_than_row() {
        let int_column = |name: &str| {
            (
                Arc::new(Field::new(name, DataType::Int32, false)),
                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
            )
        };

        let struct_array = StructArray::from(vec![
            (
                Arc::new(Field::new("bool", DataType::Boolean, false)),
                Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef,
            ),
            int_column("i32-1"),
            int_column("i32-2"),
            int_column("i32-3"),
        ]);

        for row in 0..2 {
            assert!(struct_array.is_valid(row));
        }

        let array = Arc::new(struct_array) as ArrayRef;
        let random_state = RandomState::with_seeds(0, 0, 0, 0);
        let mut hashes = vec![0; array.len()];
        create_hashes(&[array], &random_state, &mut hashes).unwrap();
        assert_eq!(hashes[0], hashes[1]);
    }
782
783
    #[test]
    // Compares real hash values, so this test is compiled out when the
    // `force_hash_collisions` feature is enabled.
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_hashes_for_map_arrays() {
        let mut builder =
            MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());

        // Appends one map row made of `entries`; `is_valid == false` marks the
        // whole row as null.
        let mut append_row = |entries: &[(&str, Option<i32>)], is_valid: bool| {
            for &(key, _) in entries {
                builder.keys().append_value(key);
            }
            for &(_, value) in entries {
                match value {
                    Some(v) => builder.values().append_value(v),
                    None => builder.values().append_null(),
                }
            }
            builder.append(is_valid).unwrap();
        };

        // Row 0
        append_row(&[("key1", Some(1)), ("key2", Some(2))], true);
        // Row 1: identical to row 0
        append_row(&[("key1", Some(1)), ("key2", Some(2))], true);
        // Row 2: second value differs
        append_row(&[("key1", Some(1)), ("key2", Some(3))], true);
        // Row 3: second key differs
        append_row(&[("key1", Some(1)), ("key3", Some(2))], true);
        // Row 4: single entry
        append_row(&[("key1", Some(1))], true);
        // Row 5: single entry with a null value
        append_row(&[("key1", None)], true);
        // Row 6: empty map
        append_row(&[], true);
        // Row 7: null map (entries are appended but the row is invalid)
        append_row(&[("key1", Some(1))], false);

        let map_column = Arc::new(builder.finish()) as ArrayRef;

        let mut row_hashes = vec![0; map_column.len()];
        let state = RandomState::with_seeds(0, 0, 0, 0);
        create_hashes(&[map_column], &state, &mut row_hashes).unwrap();

        assert_eq!(row_hashes[0], row_hashes[1]); // same value
        assert_ne!(row_hashes[0], row_hashes[2]); // different value
        assert_ne!(row_hashes[0], row_hashes[3]); // different key
        assert_ne!(row_hashes[0], row_hashes[4]); // missing an entry
        assert_ne!(row_hashes[4], row_hashes[5]); // filled vs null value
        assert_eq!(row_hashes[6], row_hashes[7]); // empty vs null map
    }
840
841
    #[test]
    // Compares real hash values, so this test is compiled out when the
    // `force_hash_collisions` feature is enabled.
    #[cfg(not(feature = "force_hash_collisions"))]
    fn create_multi_column_hash_for_dict_arrays() {
        let plain_values = [Some("foo"), None, Some("bar")];
        let dict_values = [Some("blarg"), Some("blah"), None];
        let row_count = plain_values.len();

        let string_col: ArrayRef =
            Arc::new(plain_values.iter().copied().collect::<StringArray>());
        let dict_col: ArrayRef = Arc::new(
            dict_values
                .iter()
                .copied()
                .collect::<DictionaryArray<Int32Type>>(),
        );

        let state = RandomState::with_seeds(0, 0, 0, 0);

        // Hash the dictionary column on its own ...
        let mut dict_only_hashes = vec![0; row_count];
        create_hashes(&[Arc::clone(&dict_col)], &state, &mut dict_only_hashes).unwrap();

        // ... and again together with the plain string column.
        let mut combined_hashes = vec![0; row_count];
        create_hashes(&[dict_col, string_col], &state, &mut combined_hashes).unwrap();

        assert_eq!(dict_only_hashes.len(), 3);
        assert_eq!(combined_hashes.len(), 3);

        // Adding a second column must change the resulting hashes.
        assert_ne!(dict_only_hashes, combined_hashes);
    }
879
}