Skip to content

Commit

Permalink
Support partial aggregation skip for boolean functions (#11847)
Browse files Browse the repository at this point in the history
* partial aggr for bool_*()

* Use null filter
  • Loading branch information
2010YOUY01 authored Aug 15, 2024
1 parent 06bcf33 commit 41f6dd9
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

use std::sync::Arc;

use crate::aggregate::groups_accumulator::nulls::filtered_null_mask;
use arrow::array::{ArrayRef, AsArray, BooleanArray, BooleanBufferBuilder};
use arrow::buffer::BooleanBuffer;
use datafusion_common::Result;
Expand Down Expand Up @@ -135,4 +136,22 @@ where
// capacity is in bits, so convert to bytes
self.values.capacity() / 8 + self.null_state.size()
}

fn convert_to_state(
&self,
values: &[ArrayRef],
opt_filter: Option<&BooleanArray>,
) -> Result<Vec<ArrayRef>> {
let values = values[0].as_boolean().clone();

let values_null_buffer_filtered = filtered_null_mask(opt_filter, &values);
let (values_buf, _) = values.into_parts();
let values_filtered = BooleanArray::new(values_buf, values_null_buffer_filtered);

Ok(vec![Arc::new(values_filtered)])
}

fn supports_convert_to_state(&self) -> bool {
true
}
}
114 changes: 114 additions & 0 deletions datafusion/sqllogictest/test_files/aggregate_skip_partial.slt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,22 @@ STORED AS CSV
LOCATION '../../testing/data/csv/aggregate_test_100.csv'
OPTIONS ('format.has_header' 'true');

# Table to test `bool_and()`, `bool_or()` aggregate functions
statement ok
CREATE TABLE aggregate_test_100_bool (
v1 VARCHAR NOT NULL,
v2 BOOLEAN,
v3 BOOLEAN
);

statement ok
INSERT INTO aggregate_test_100_bool
SELECT
c1 as v1,
CASE WHEN c2 > 3 THEN TRUE WHEN c2 > 1 THEN FALSE ELSE NULL END as v2,
CASE WHEN c1='a' OR c1='b' THEN TRUE WHEN c1='c' OR c1='d' THEN FALSE ELSE NULL END as v3
FROM aggregate_test_100;

# Prepare settings to skip partial aggregation from the beginning
statement ok
set datafusion.execution.skip_partial_aggregation_probe_rows_threshold = 0;
Expand Down Expand Up @@ -117,6 +133,33 @@ GROUP BY 1, 2 ORDER BY 1 LIMIT 5;
-2117946883 d -2117946883 NULL NULL NULL
-2098805236 c -2098805236 NULL NULL NULL

# FIXME: add bool_and(v3) column when issue fixed
# ISSUE https://github.com/apache/datafusion/issues/11846
query TBBB rowsort
select v1, bool_or(v2), bool_and(v2), bool_or(v3)
from aggregate_test_100_bool
group by v1
----
a true false true
b true false true
c true false false
d true false false
e true false NULL

query TBBB rowsort
select v1,
bool_or(v2) FILTER (WHERE v1 = 'a' OR v1 = 'c' OR v1 = 'e'),
bool_or(v2) FILTER (WHERE v2 = false),
bool_or(v2) FILTER (WHERE v2 = NULL)
from aggregate_test_100_bool
group by v1
----
a true false NULL
b NULL false NULL
c true false NULL
d NULL false NULL
e true false NULL

# Prepare settings to always skip aggregation after couple of batches
statement ok
set datafusion.execution.skip_partial_aggregation_probe_rows_threshold = 10;
Expand Down Expand Up @@ -223,6 +266,32 @@ c 2.666666666667 0.425241138254
d 2.444444444444 0.541519476308
e 3 0.505440263521

# FIXME: add bool_and(v3) column when issue fixed
# ISSUE https://github.com/apache/datafusion/issues/11846
query TBBB rowsort
select v1, bool_or(v2), bool_and(v2), bool_or(v3)
from aggregate_test_100_bool
group by v1
----
a true false true
b true false true
c true false false
d true false false
e true false NULL

query TBBB rowsort
select v1,
bool_or(v2) FILTER (WHERE v1 = 'a' OR v1 = 'c' OR v1 = 'e'),
bool_or(v2) FILTER (WHERE v2 = false),
bool_or(v2) FILTER (WHERE v2 = NULL)
from aggregate_test_100_bool
group by v1
----
a true false NULL
b NULL false NULL
c true false NULL
d NULL false NULL
e true false NULL

# Enabling PG dialect for filtered aggregates tests
statement ok
Expand Down Expand Up @@ -377,3 +446,48 @@ ORDER BY i;

statement ok
DROP TABLE decimal_table;

# Extra tests for 'bool_*()' edge cases
statement ok
set datafusion.execution.skip_partial_aggregation_probe_rows_threshold = 0;

statement ok
set datafusion.execution.skip_partial_aggregation_probe_ratio_threshold = 0.0;

statement ok
set datafusion.execution.target_partitions = 1;

statement ok
set datafusion.execution.batch_size = 1;

statement ok
create table bool_aggregate_functions (
c1 boolean not null,
c2 boolean not null,
c3 boolean not null,
c4 boolean not null,
c5 boolean,
c6 boolean,
c7 boolean,
c8 boolean
)
as values
(true, true, false, false, true, true, null, null),
(true, false, true, false, false, null, false, null),
(true, true, false, false, null, true, false, null);

query BBBBBBBB
SELECT bool_and(c1), bool_and(c2), bool_and(c3), bool_and(c4), bool_and(c5), bool_and(c6), bool_and(c7), bool_and(c8) FROM bool_aggregate_functions
----
true false false false false true false NULL

statement ok
set datafusion.execution.skip_partial_aggregation_probe_rows_threshold = 2;

query BBBBBBBB
SELECT bool_and(c1), bool_and(c2), bool_and(c3), bool_and(c4), bool_and(c5), bool_and(c6), bool_and(c7), bool_and(c8) FROM bool_aggregate_functions
----
true false false false false true false NULL

statement ok
DROP TABLE aggregate_test_100_bool

0 comments on commit 41f6dd9

Please sign in to comment.