Skip to content

Commit

Permalink
Enhance/Refactor Ordering Equivalence Properties (#7566)
Browse files Browse the repository at this point in the history
* separate implementation of oeq properties

* Simplifications

* Move utils to methods

* Remove unnecessary code

* Address todo

* Buggy is_aggressive mode will be added

* start implementing aggressive mode

* all tests pass

* minor changes

* All tests pass

* Minor changes

* All tests pass

* minor changes

* all tests pass

* Simplifications

* minor changes

* Resolve linter error

* Minor changes

* minor changes

* Update plan

* Simplifications, update comments

* Update comments, Use existing stats to find constants

* Simplifications

* Unknown input stats are handled

* Address reviews

* Simplifications

* Simplifications

* Address reviews

* Fix subdirectories

---------

Co-authored-by: berkaysynnada <[email protected]>
  • Loading branch information
mustafasrepo and berkaysynnada authored Sep 18, 2023
1 parent 678d27a commit c72b98e
Show file tree
Hide file tree
Showing 14 changed files with 952 additions and 586 deletions.
24 changes: 24 additions & 0 deletions datafusion/common/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
use std::fmt::Display;

use arrow::datatypes::DataType;

use crate::ScalarValue;

/// Statistics for a relation
Expand Down Expand Up @@ -70,3 +72,25 @@ pub struct ColumnStatistics {
/// Number of distinct values
pub distinct_count: Option<usize>,
}

impl ColumnStatistics {
    /// Reports whether the column is known to hold exactly one non-null value
    /// (i.e. it behaves like a constant).
    ///
    /// Returns `true` only when both `min_value` and `max_value` are present,
    /// neither of them is a null scalar, and they compare equal. In every
    /// other case (a missing bound, a null bound, or differing bounds) the
    /// result is `false`.
    pub fn is_singleton(&self) -> bool {
        if let (Some(lower), Some(upper)) = (&self.min_value, &self.max_value) {
            // A null bound carries no information about the actual values, so
            // it can never prove that the column is constant.
            !(lower.is_null() || upper.is_null()) && lower == upper
        } else {
            false
        }
    }

    /// Builds the [`ColumnStatistics`] for a column of type `dt` whose range is unbounded.
    ///
    /// Both `min_value` and `max_value` are set to the scalar produced by
    /// `ScalarValue::try_from` for `dt` (used here to represent an infinite
    /// bound), or to `None` when that conversion fails. `null_count` and
    /// `distinct_count` are left unknown (`None`).
    pub fn new_with_unbounded_column(dt: &DataType) -> ColumnStatistics {
        // `.ok()` turns a failed conversion into `None` rather than an error.
        let unbounded = ScalarValue::try_from(dt.clone()).ok();
        Self {
            null_count: None,
            distinct_count: None,
            min_value: unbounded.clone(),
            max_value: unbounded,
        }
    }
}
24 changes: 4 additions & 20 deletions datafusion/core/src/physical_optimizer/enforce_distribution.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ use datafusion_physical_expr::utils::{
map_columns_before_projection, ordering_satisfy_requirement_concrete,
};
use datafusion_physical_expr::{
expr_list_eq_strict_order, normalize_expr_with_equivalence_properties, PhysicalExpr,
PhysicalSortRequirement,
expr_list_eq_strict_order, PhysicalExpr, PhysicalSortRequirement,
};

use datafusion_common::internal_err;
Expand Down Expand Up @@ -807,36 +806,21 @@ fn try_reorder(
} else if !equivalence_properties.classes().is_empty() {
normalized_expected = expected
.iter()
.map(|e| {
normalize_expr_with_equivalence_properties(
e.clone(),
equivalence_properties.classes(),
)
})
.map(|e| equivalence_properties.normalize_expr(e.clone()))
.collect::<Vec<_>>();
assert_eq!(normalized_expected.len(), expected.len());

normalized_left_keys = join_keys
.left_keys
.iter()
.map(|e| {
normalize_expr_with_equivalence_properties(
e.clone(),
equivalence_properties.classes(),
)
})
.map(|e| equivalence_properties.normalize_expr(e.clone()))
.collect::<Vec<_>>();
assert_eq!(join_keys.left_keys.len(), normalized_left_keys.len());

normalized_right_keys = join_keys
.right_keys
.iter()
.map(|e| {
normalize_expr_with_equivalence_properties(
e.clone(),
equivalence_properties.classes(),
)
})
.map(|e| equivalence_properties.normalize_expr(e.clone()))
.collect::<Vec<_>>();
assert_eq!(join_keys.right_keys.len(), normalized_right_keys.len());

Expand Down
5 changes: 4 additions & 1 deletion datafusion/physical-expr/src/analysis.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,12 +189,15 @@ fn shrink_boundaries(
})?;
let final_result = graph.get_interval(*root_index);

// If we encounter an error during the selectivity calculation, fall back to 1.0 as the
// estimate, since it is the safest choice (i.e. the largest possible value).
let selectivity = calculate_selectivity(
&final_result.lower.value,
&final_result.upper.value,
&target_boundaries,
&initial_boundaries,
)?;
)
.unwrap_or(1.0);

if !(0.0..=1.0).contains(&selectivity) {
return internal_err!("Selectivity is out of limit: {}", selectivity);
Expand Down
Loading

0 comments on commit c72b98e

Please sign in to comment.