Skip to content

Commit

Permalink
Fix join order for TPCH Q17 & Q18 by improving FilterExec statistics (#8126)
Browse files Browse the repository at this point in the history

* Assume filters are highly selective if we cannot truly estimate cardinality

* fix regression

* cargo fmt

* simplify code

* Update datafusion/physical-plan/src/filter.rs

Co-authored-by: Daniël Heres <[email protected]>

* add comment with link to follow-on issue

* Use default of 20% selectivity

* trigger CI

* remove files

* trigger CI

* address feedback

---------

Co-authored-by: Daniël Heres <[email protected]>
  • Loading branch information
andygrove and Dandandan authored Nov 12, 2023
1 parent e642cc2 commit 6fe00ce
Showing 1 changed file with 14 additions and 2 deletions.
16 changes: 14 additions & 2 deletions datafusion/physical-plan/src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,11 +194,23 @@ impl ExecutionPlan for FilterExec {
fn statistics(&self) -> Result<Statistics> {
let predicate = self.predicate();

let input_stats = self.input.statistics()?;
let schema = self.schema();
if !check_support(predicate, &schema) {
return Ok(Statistics::new_unknown(&schema));
// assume filter selects 20% of rows if we cannot do anything smarter
// tracking issue for making this configurable:
// https://github.com/apache/arrow-datafusion/issues/8133
let selectivity = 0.2_f32;
let mut stats = input_stats.clone().into_inexact();
if let Precision::Inexact(n) = stats.num_rows {
stats.num_rows = Precision::Inexact((selectivity * n as f32) as usize);
}
if let Precision::Inexact(n) = stats.total_byte_size {
stats.total_byte_size =
Precision::Inexact((selectivity * n as f32) as usize);
}
return Ok(stats);
}
let input_stats = self.input.statistics()?;

let num_rows = input_stats.num_rows;
let total_byte_size = input_stats.total_byte_size;
Expand Down

0 comments on commit 6fe00ce

Please sign in to comment.