Skip to content

Commit

Permalink
Removes Bloom filter for Int8/Int16/UInt8/UInt16 (#9969)
Browse files Browse the repository at this point in the history
* Removing broken tests

* Simplifying tests / removing support for failed tests

* Revert "Simplifying tests / removing support for failed tests"

This reverts commit 6e50a80.

* Fixing tests for real

* Apply suggestions from code review

Thanks @alamb !

Co-authored-by: Andrew Lamb <[email protected]>

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
edmondop and alamb authored Apr 8, 2024
1 parent fc29c3e commit 820843f
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -232,12 +232,8 @@ impl PruningStatistics for BloomFilterStatistics {
ScalarValue::Float32(Some(v)) => sbbf.check(v),
ScalarValue::Int64(Some(v)) => sbbf.check(v),
ScalarValue::Int32(Some(v)) => sbbf.check(v),
ScalarValue::Int16(Some(v)) => sbbf.check(v),
ScalarValue::Int8(Some(v)) => sbbf.check(v),
ScalarValue::UInt64(Some(v)) => sbbf.check(v),
ScalarValue::UInt32(Some(v)) => sbbf.check(v),
ScalarValue::UInt16(Some(v)) => sbbf.check(v),
ScalarValue::UInt8(Some(v)) => sbbf.check(v),
ScalarValue::Decimal128(Some(v), p, s) => match parquet_type {
Type::INT32 => {
//https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/Encodings.md?plain=1#L35-L42
Expand Down
54 changes: 26 additions & 28 deletions datafusion/core/tests/parquet/row_group_pruning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ async fn prune_disabled() {
// https://github.com/apache/arrow-datafusion/issues/9779 bug so that tests pass
// if and only if Bloom filters on Int8 and Int16 columns are still buggy.
macro_rules! int_tests {
($bits:expr, correct_bloom_filters: $correct_bloom_filters:expr) => {
($bits:expr) => {
paste::item! {
#[tokio::test]
async fn [<prune_int $bits _lt >]() {
Expand Down Expand Up @@ -329,9 +329,9 @@ macro_rules! int_tests {
.with_expected_errors(Some(0))
.with_matched_by_stats(Some(1))
.with_pruned_by_stats(Some(3))
.with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
.with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
.with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
.with_matched_by_bloom_filter(Some(1))
.with_pruned_by_bloom_filter(Some(0))
.with_expected_rows(1)
.test_row_group_prune()
.await;
}
Expand All @@ -343,9 +343,9 @@ macro_rules! int_tests {
.with_expected_errors(Some(0))
.with_matched_by_stats(Some(1))
.with_pruned_by_stats(Some(3))
.with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
.with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
.with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
.with_matched_by_bloom_filter(Some(1))
.with_pruned_by_bloom_filter(Some(0))
.with_expected_rows(1)
.test_row_group_prune()
.await;
}
Expand Down Expand Up @@ -404,9 +404,9 @@ macro_rules! int_tests {
.with_expected_errors(Some(0))
.with_matched_by_stats(Some(1))
.with_pruned_by_stats(Some(3))
.with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
.with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
.with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
.with_matched_by_bloom_filter(Some(1))
.with_pruned_by_bloom_filter(Some(0))
.with_expected_rows(1)
.test_row_group_prune()
.await;
}
Expand Down Expand Up @@ -447,17 +447,16 @@ macro_rules! int_tests {
};
}

int_tests!(8, correct_bloom_filters: false);
int_tests!(16, correct_bloom_filters: false);
int_tests!(32, correct_bloom_filters: true);
int_tests!(64, correct_bloom_filters: true);
// Bloom filters for int8/int16 are incorrect, so those widths are not tested: https://github.com/apache/arrow-datafusion/issues/9779
int_tests!(32);
int_tests!(64);

// $bits: number of bits of the unsigned integer to test (32, 64)
// The former `$correct_bloom_filters` flag has been removed; it replicated the
// https://github.com/apache/arrow-datafusion/issues/9779 bug so that tests passed
// if and only if Bloom filters on UInt8 and UInt16 columns were still buggy.
macro_rules! uint_tests {
($bits:expr, correct_bloom_filters: $correct_bloom_filters:expr) => {
($bits:expr) => {
paste::item! {
#[tokio::test]
async fn [<prune_uint $bits _lt >]() {
Expand All @@ -482,9 +481,9 @@ macro_rules! uint_tests {
.with_expected_errors(Some(0))
.with_matched_by_stats(Some(1))
.with_pruned_by_stats(Some(3))
.with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
.with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
.with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
.with_matched_by_bloom_filter(Some(1))
.with_pruned_by_bloom_filter(Some(0))
.with_expected_rows(1)
.test_row_group_prune()
.await;
}
Expand All @@ -496,9 +495,9 @@ macro_rules! uint_tests {
.with_expected_errors(Some(0))
.with_matched_by_stats(Some(1))
.with_pruned_by_stats(Some(3))
.with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
.with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
.with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
.with_matched_by_bloom_filter(Some(1))
.with_pruned_by_bloom_filter(Some(0))
.with_expected_rows(1)
.test_row_group_prune()
.await;
}
Expand Down Expand Up @@ -542,9 +541,9 @@ macro_rules! uint_tests {
.with_expected_errors(Some(0))
.with_matched_by_stats(Some(1))
.with_pruned_by_stats(Some(3))
.with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
.with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
.with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
.with_matched_by_bloom_filter(Some(1))
.with_pruned_by_bloom_filter(Some(0))
.with_expected_rows(1)
.test_row_group_prune()
.await;
}
Expand Down Expand Up @@ -585,10 +584,9 @@ macro_rules! uint_tests {
};
}

uint_tests!(8, correct_bloom_filters: false);
uint_tests!(16, correct_bloom_filters: false);
uint_tests!(32, correct_bloom_filters: true);
uint_tests!(64, correct_bloom_filters: true);
// Bloom filters for uint8/uint16 are incorrect, so those widths are not tested: https://github.com/apache/arrow-datafusion/issues/9779
uint_tests!(32);
uint_tests!(64);

#[tokio::test]
async fn prune_int32_eq_large_in_list() {
Expand Down

0 comments on commit 820843f

Please sign in to comment.