From b8fb3cfb338a12a54424b931610c865125a516d5 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 6 Dec 2023 06:45:47 -0500 Subject: [PATCH 01/12] Introduce LiteralGurantee to find col=const --- .../physical-expr/src/utils/guarantee.rs | 507 ++++++++++++++++++ .../src/{utils.rs => utils/mod.rs} | 33 +- 2 files changed, 527 insertions(+), 13 deletions(-) create mode 100644 datafusion/physical-expr/src/utils/guarantee.rs rename datafusion/physical-expr/src/{utils.rs => utils/mod.rs} (96%) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs new file mode 100644 index 000000000000..59266f0cc96e --- /dev/null +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -0,0 +1,507 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`LiteralGuarantee`] predicate analysis to determine if a column is a +//! constant. + +use crate::utils::split_disjunction; +use crate::{split_conjunction, PhysicalExpr}; +use datafusion_common::{Column, ScalarValue}; +use datafusion_expr::Operator; +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +/// Represents a known guarantee that a column is either one of a set of values +/// or not one of a set of values. +/// +/// `LiteralGuarantee`s can be used to simplify expressions and skip data files +/// (or row groups in parquet files). For example, if we know that `a = 1` then +/// we can skip any partition that does not contain `a = 1`. +/// +/// See [`LiteralGuarantee::analyze`] to extract literal guarantees from a +/// predicate. +/// +/// # Details +/// A guarantee can be one of two forms: +/// +/// 1. One of particular set of values. For example, `(a = 1)`, `(a = 1 OR a = +/// 2) or `a IN (1, 2, 3)` +/// +/// 2. NOT one of a particular set of values. For example, `(a != 1)`, `(a != 1 +/// AND a != 2)` or `a NOT IN (1, 2, 3)` +/// +#[derive(Debug, Clone, PartialEq)] +pub struct LiteralGuarantee { + pub column: Column, + pub guarantee: Guarantee, + pub literals: HashSet, +} + +/// What is be guaranteed about the values? +#[derive(Debug, Clone, PartialEq)] +pub enum Guarantee { + /// `column` is one of a set of constant values + In, + /// `column` is NOT one of a set of constant values + NotIn, +} + +impl LiteralGuarantee { + /// Create a new instance of the guarantee if the provided operator is + /// supported. Returns None otherwise. See [`LiteralGuarantee::analyze`] to + /// create these structures from an expression. + fn try_new<'a>( + column_name: impl Into, + op: &Operator, + literals: impl IntoIterator, + ) -> Option { + let guarantee = match op { + Operator::Eq => Guarantee::In, + Operator::NotEq => Guarantee::NotIn, + _ => return None, + }; + + let literals: HashSet<_> = literals.into_iter().cloned().collect(); + + Some(Self { + column: Column::from_name(column_name), + guarantee, + literals, + }) + } + + /// Return a list of `LiteralGuarantees` for the specified predicate that + /// hold if the predicate (filter) expression evaluates to `true`. + /// + /// `expr` should be a boolean expression + /// + /// Note: this function does not simplify the expression prior to analysis. + pub fn analyze(expr: &Arc) -> Vec { + split_conjunction(expr) + .into_iter() + .fold(GuaranteeBuilder::new(), |builder, expr| { + if let Some(cel) = ColOpLit::try_new(expr) { + return builder.aggregate_conjunct(cel); + } else { + // look for (col literal) OR (col literal) ... + let disjunctions = split_disjunction(expr); + + let terms = disjunctions + .iter() + .filter_map(|expr| ColOpLit::try_new(expr)) + .collect::>(); + + if terms.is_empty() { + return builder; + } + + // if not all terms are of the form (col literal), + // can't infer anything + if terms.len() != disjunctions.len() { + return builder; + } + + // if all terms are 'col literal' with the same column + // and operation we can add a guarantee + let first_term = &terms[0]; + if terms.iter().all(|term| { + term.col.name() == first_term.col.name() + && term.op == first_term.op + }) { + builder.aggregate_multi_conjunct( + first_term.col, + first_term.op, + terms.iter().map(|term| term.lit.value()), + ) + } else { + // can't infer anything + builder + } + } + }) + .build() + } +} + +/// Combines conjuncts (aka terms `AND`ed together) into [`LiteralGuarantee`]s, +/// preserving insert order +#[derive(Debug, Default)] +struct GuaranteeBuilder<'a> { + /// List of guarantees that have been created so far + /// if we have determined a subsequent conjunct invalidates a guarantee + /// e.g. `a = foo AND a = bar` then the relevant guarantee will be None + guarantees: Vec>, + + /// Key is the (column name, operator type) + /// Value is the index into `guarantees` + map: HashMap<(&'a crate::expressions::Column, &'a Operator), usize>, +} + +impl<'a> GuaranteeBuilder<'a> { + fn new() -> Self { + Default::default() + } + + /// Aggregate a new single guarantee to this builder combining with existing guarantees + /// if possible + fn aggregate_conjunct(self, col_op_lit: ColOpLit<'a>) -> Self { + self.aggregate_multi_conjunct( + col_op_lit.col, + col_op_lit.op, + [col_op_lit.lit.value()], + ) + } + + /// Aggreates a new single new guarantee with multiple literals `a IN (1,2,3)` or `a NOT IN (1,2,3)`. So the new values are combined with OR + fn aggregate_multi_conjunct( + mut self, + col: &'a crate::expressions::Column, + op: &'a Operator, + new_values: impl IntoIterator, + ) -> Self { + let key = (col, op); + if let Some(index) = self.map.get(&key) { + // already have a guarantee for this column + let entry = &mut self.guarantees[*index]; + + let Some(existing) = entry else { + // guarantee has been previously invalidated, nothing to do + return self; + }; + + // can only combine conjuncts if we have `a != foo AND a != bar`. + // `a = foo AND a = bar` is not correct. Also, can't extend with more than one value. + match existing.guarantee { + Guarantee::NotIn => { + // can extend if only single literal, otherwise invalidate + let new_values: HashSet<_> = new_values.into_iter().collect(); + if new_values.len() == 1 { + existing.literals.extend(new_values.into_iter().cloned()) + } else { + // this is like (a != foo AND (a != bar OR a != baz)). + // We can't combine the (a!=bar OR a!=baz) part, but it + // also doesn't invalidate a != foo guarantee. + } + } + Guarantee::In => { + // for an IN guarantee, it is ok if the value is the same + // e.g. `a = foo AND a = foo` but not if the value is different + // e.g. `a = foo AND a = bar` + if new_values + .into_iter() + .all(|new_value| existing.literals.contains(new_value)) + { + // all values are already in the set + } else { + // at least one was not, so invalidate the guarantee + *entry = None; + } + } + } + } else { + // This is a new guarantee + let new_values: HashSet<_> = new_values.into_iter().collect(); + + // new_values are combined with OR, so we can only create a + // multi-column guarantee for `=` (or a single value). + // (e.g. ignore `a != foo OR a != bar`) + if op == &Operator::Eq || new_values.len() == 1 { + if let Some(guarantee) = + LiteralGuarantee::try_new(col.name(), op, new_values) + { + // add it to the list of guarantees + self.guarantees.push(Some(guarantee)); + self.map.insert(key, self.guarantees.len() - 1); + } + } + } + + self + } + + /// Return all guarantees that have been created so far + fn build(self) -> Vec { + // filter out any guarantees that have been invalidated + self.guarantees.into_iter().flatten().collect() + } +} + +/// Represents a single `col literal` expression +struct ColOpLit<'a> { + col: &'a crate::expressions::Column, + op: &'a Operator, + lit: &'a crate::expressions::Literal, +} + +impl<'a> ColOpLit<'a> { + /// Returns Some(ColEqLit) if the expression is either: + /// 1. `col literal` + /// 2. `literal col` + /// + /// Returns None otherwise + fn try_new(expr: &'a Arc) -> Option { + let binary_expr = expr + .as_any() + .downcast_ref::()?; + + let (left, op, right) = ( + binary_expr.left().as_any(), + binary_expr.op(), + binary_expr.right().as_any(), + ); + + // col literal + if let (Some(col), Some(lit)) = ( + left.downcast_ref::(), + right.downcast_ref::(), + ) { + Some(Self { col, op, lit }) + } + // literal col + else if let (Some(lit), Some(col)) = ( + left.downcast_ref::(), + right.downcast_ref::(), + ) { + Some(Self { col, op, lit }) + } else { + None + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::create_physical_expr; + use crate::execution_props::ExecutionProps; + use arrow_schema::{DataType, Field, Schema, SchemaRef}; + use datafusion_common::ToDFSchema; + use datafusion_expr::expr_fn::*; + use datafusion_expr::{lit, Expr}; + use std::sync::OnceLock; + + #[test] + fn test_literal() { + // a single literal offers no guarantee + test_analyze(lit(true), vec![]) + } + + #[test] + fn test_single() { + // a = "foo" + test_analyze(col("a").eq(lit("foo")), vec![in_guarantee("a", ["foo"])]); + // "foo" = a + test_analyze(lit("foo").eq(col("a")), vec![in_guarantee("a", ["foo"])]); + // a != "foo" + test_analyze( + col("a").not_eq(lit("foo")), + vec![not_in_guarantee("a", ["foo"])], + ); + // a != "foo" + test_analyze( + lit("foo").not_eq(col("a")), + vec![not_in_guarantee("a", ["foo"])], + ); + } + + #[test] + fn test_conjunction() { + // a = "foo" AND b = 1 + test_analyze( + col("a").eq(lit("foo")).and(col("b").eq(lit(1))), + vec![ + // should find both column guarantees + in_guarantee("a", ["foo"]), + in_guarantee("b", [1]), + ], + ); + // a != "foo" AND b != 1 + test_analyze( + col("a").not_eq(lit("foo")).and(col("b").not_eq(lit(1))), + // should find both column guarantees + vec![not_in_guarantee("a", ["foo"]), not_in_guarantee("b", [1])], + ); + // a = "foo" AND a = "bar" + test_analyze( + col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))), + // this predicate is impossible ( can't be both foo and bar), + vec![], + ); + // a = "foo" AND b != "bar" + test_analyze( + col("a").eq(lit("foo")).and(col("a").not_eq(lit("bar"))), + vec![in_guarantee("a", ["foo"]), not_in_guarantee("a", ["bar"])], + ); + // a != "foo" AND a != "bar" + test_analyze( + col("a").not_eq(lit("foo")).and(col("a").not_eq(lit("bar"))), + // know it isn't "foo" or "bar" + vec![not_in_guarantee("a", ["foo", "bar"])], + ); + // a != "foo" AND a != "bar" and a != "baz" + test_analyze( + col("a") + .not_eq(lit("foo")) + .and(col("a").not_eq(lit("bar"))) + .and(col("a").not_eq(lit("baz"))), + // know it isn't "foo" or "bar" or "baz" + vec![not_in_guarantee("a", ["foo", "bar", "baz"])], + ); + // a = "foo" AND a = "foo" + let expr = col("a").eq(lit("foo")); + test_analyze(expr.clone().and(expr), vec![in_guarantee("a", ["foo"])]); + // b > 5 AND b = 10 (should get an b = 10 guarantee) + test_analyze( + col("b").gt(lit(5)).and(col("b").eq(lit(10))), + vec![in_guarantee("b", [10])], + ); + + // a != "foo" and (a != "bar" OR a != "baz") + test_analyze( + col("a") + .not_eq(lit("foo")) + .and(col("a").not_eq(lit("bar")).or(col("a").not_eq(lit("baz")))), + // a is not foo (we can't represent other knowledge about a) + vec![not_in_guarantee("a", ["foo"])], + ); + } + + #[test] + fn test_disjunction() { + // a = "foo" OR b = 1 + test_analyze( + col("a").eq(lit("foo")).or(col("b").eq(lit(1))), + // no can't have a single column guarantee (if a = "foo" then b != 1) etc + vec![], + ); + // a != "foo" OR b != 1 + test_analyze( + col("a").not_eq(lit("foo")).or(col("b").not_eq(lit(1))), + // No single column guarantee + vec![], + ); + // a = "foo" OR a = "bar" + test_analyze( + col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))), + vec![in_guarantee("a", ["foo", "bar"])], + ); + // a = "foo" OR a = "foo" + test_analyze( + col("a").eq(lit("foo")).or(col("a").eq(lit("foo"))), + vec![in_guarantee("a", ["foo"])], + ); + // a != "foo" OR a != "bar" + test_analyze( + col("a").not_eq(lit("foo")).or(col("a").not_eq(lit("bar"))), + // can't represent knowledge about a in this case + vec![], + ); + // a = "foo" OR a = "bar" OR a = "baz" + test_analyze( + col("a") + .eq(lit("foo")) + .or(col("a").eq(lit("bar"))) + .or(col("a").eq(lit("baz"))), + vec![in_guarantee("a", ["foo", "bar", "baz"])], + ); + // (a = "foo" OR a = "bar") AND (a = "baz)" + test_analyze( + (col("a").eq(lit("foo")).or(col("a").eq(lit("bar")))) + .and(col("a").eq(lit("baz"))), + // this could potentially be represented as 2 constraints with a more + // sophisticated analysis + vec![], + ); + // (a = "foo" OR a = "bar") AND (b = 1) + test_analyze( + (col("a").eq(lit("foo")).or(col("a").eq(lit("bar")))) + .and(col("b").eq(lit(1))), + vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1])], + ); + // (a = "foo" OR a = "bar") OR (b = 1) + test_analyze( + col("a") + .eq(lit("foo")) + .or(col("a").eq(lit("bar"))) + .or(col("b").eq(lit(1))), + // can't represent knowledge about a or b in this case + vec![], + ); + } + + // TODO https://github.com/apache/arrow-datafusion/issues/8436 + // a IN (...) + // b NOT IN (...) + + /// Tests that analyzing expr results in the expected guarantees + fn test_analyze(expr: Expr, expected: Vec) { + println!("Begin analyze of {expr}"); + let schema = schema(); + let physical_expr = logical2physical(&expr, &schema); + + let actual = LiteralGuarantee::analyze(&physical_expr); + assert_eq!( + expected, actual, + "expr: {expr}\ + \n\nexpected: {expected:#?}\ + \n\nactual: {actual:#?}\ + \n\nexpr: {expr:#?}\ + \n\nphysical_expr: {physical_expr:#?}" + ); + } + + /// Guarantee that column is a specified value + fn in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee + where + I: IntoIterator, + S: Into + 'a, + { + let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect(); + LiteralGuarantee::try_new(column, &Operator::Eq, literals.iter()).unwrap() + } + + /// Guarantee that column is NOT a specified value + fn not_in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee + where + I: IntoIterator, + S: Into + 'a, + { + let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect(); + LiteralGuarantee::try_new(column, &Operator::NotEq, literals.iter()).unwrap() + } + + /// Convert a logical expression to a physical expression (without any simplification, etc) + fn logical2physical(expr: &Expr, schema: &Schema) -> Arc { + let df_schema = schema.clone().to_dfschema().unwrap(); + let execution_props = ExecutionProps::new(); + create_physical_expr(expr, &df_schema, schema, &execution_props).unwrap() + } + + // Schema for testing + fn schema() -> SchemaRef { + SCHEMA + .get_or_init(|| { + Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Int32, false), + ])) + }) + .clone() + } + + static SCHEMA: OnceLock = OnceLock::new(); +} diff --git a/datafusion/physical-expr/src/utils.rs b/datafusion/physical-expr/src/utils/mod.rs similarity index 96% rename from datafusion/physical-expr/src/utils.rs rename to datafusion/physical-expr/src/utils/mod.rs index 71a7ff5fb778..87ef36558b96 100644 --- a/datafusion/physical-expr/src/utils.rs +++ b/datafusion/physical-expr/src/utils/mod.rs @@ -15,6 +15,9 @@ // specific language governing permissions and limitations // under the License. +mod guarantee; +pub use guarantee::{Guarantee, LiteralGuarantee}; + use std::borrow::Borrow; use std::collections::{HashMap, HashSet}; use std::sync::Arc; @@ -41,25 +44,29 @@ use petgraph::stable_graph::StableGraph; pub fn split_conjunction( predicate: &Arc, ) -> Vec<&Arc> { - split_conjunction_impl(predicate, vec![]) + split_impl(Operator::And, predicate, vec![]) } -fn split_conjunction_impl<'a>( +/// Assume the predicate is in the form of DNF, split the predicate to a Vec of PhysicalExprs. +/// +/// For example, split "a1 = a2 OR b1 <= b2 OR c1 != c2" into ["a1 = a2", "b1 <= b2", "c1 != c2"] +pub fn split_disjunction( + predicate: &Arc, +) -> Vec<&Arc> { + split_impl(Operator::Or, predicate, vec![]) +} + +fn split_impl<'a>( + operator: Operator, predicate: &'a Arc, mut exprs: Vec<&'a Arc>, ) -> Vec<&'a Arc> { match predicate.as_any().downcast_ref::() { - Some(binary) => match binary.op() { - Operator::And => { - let exprs = split_conjunction_impl(binary.left(), exprs); - split_conjunction_impl(binary.right(), exprs) - } - _ => { - exprs.push(predicate); - exprs - } - }, - None => { + Some(binary) if binary.op() == &operator => { + let exprs = split_impl(operator, binary.left(), exprs); + split_impl(operator, binary.right(), exprs) + } + Some(_) | None => { exprs.push(predicate); exprs } From e1df3fe54accb95f5aad590624bfe16ebce444ec Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 6 Dec 2023 07:52:52 -0500 Subject: [PATCH 02/12] Improve comments --- datafusion/physical-expr/src/utils/guarantee.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 59266f0cc96e..bb5474f19a2b 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -156,8 +156,12 @@ impl<'a> GuaranteeBuilder<'a> { Default::default() } - /// Aggregate a new single guarantee to this builder combining with existing guarantees - /// if possible + /// Aggregate a new single `AND col literal` term to this builder + /// combining with existing guarantees if possible. + /// + /// # Examples + /// * `AND (a = 1)`: `a` is guaranteed to be 1 + /// * `AND (a != 1)`: a is guaranteed to not be 1 fn aggregate_conjunct(self, col_op_lit: ColOpLit<'a>) -> Self { self.aggregate_multi_conjunct( col_op_lit.col, @@ -166,7 +170,14 @@ impl<'a> GuaranteeBuilder<'a> { ) } - /// Aggreates a new single new guarantee with multiple literals `a IN (1,2,3)` or `a NOT IN (1,2,3)`. So the new values are combined with OR + /// Aggregates a new single multi column term to ths builder + /// combining with existing guarantees if possible. + /// + /// # Examples + /// * (`AND (a = 1 OR a = 2 OR a = 3)`): a is guaranteed to be 1, 2, or 3 + /// * `AND (a IN (1,2,3))`: a is guaranteed to be 1, 2, or 3 + /// * `AND (a != 1 OR a != 2 OR a != 3)`: a is guaranteed to not be 1, 2, or 3 + /// * `AND (a NOT IN (1,2,3))`: a is guaranteed to not be 1, 2, or 3 fn aggregate_multi_conjunct( mut self, col: &'a crate::expressions::Column, From 7c8c8f888eccb6c81227dad8815034be556ac3cd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 12 Dec 2023 15:46:33 -0500 Subject: [PATCH 03/12] Improve documentation --- .../physical-expr/src/utils/guarantee.rs | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index bb5474f19a2b..663f00e0769f 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -25,24 +25,29 @@ use datafusion_expr::Operator; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -/// Represents a known guarantee that a column is either one of a set of values -/// or not one of a set of values. +/// Represents a known guarantee that for a particular filter expression to +/// evaluate to `true`, a column must be one of a set of values or not one of +/// a set of values. /// /// `LiteralGuarantee`s can be used to simplify expressions and skip data files -/// (or row groups in parquet files). For example, if we know that `a = 1` then -/// we can skip any partition that does not contain `a = 1`. +/// (or row groups in parquet files) by proving that an expression can not +/// evaluate to true. For example, if we know that `a = 1` must be true for the +/// filter to evaluate to `true`, then we can skip any partition where we know +/// that `a` never has the value of `1`. /// /// See [`LiteralGuarantee::analyze`] to extract literal guarantees from a -/// predicate. +/// filter predicate. /// /// # Details /// A guarantee can be one of two forms: /// -/// 1. One of particular set of values. For example, `(a = 1)`, `(a = 1 OR a = -/// 2) or `a IN (1, 2, 3)` +/// 1. The column must be one of a particular set of values for the predicate +/// to be `true`. For example, +/// `(a = 1)`, `(a = 1 OR a =2) or `a IN (1, 2, 3)` /// -/// 2. NOT one of a particular set of values. For example, `(a != 1)`, `(a != 1 -/// AND a != 2)` or `a NOT IN (1, 2, 3)` +/// The column must NOT one of a particular set of values for the predicate to +/// be `true`. For example, +/// `(a != 1)`, `(a != 1 AND a != 2)` or `a NOT IN (1, 2, 3)` /// #[derive(Debug, Clone, PartialEq)] pub struct LiteralGuarantee { @@ -51,12 +56,12 @@ pub struct LiteralGuarantee { pub literals: HashSet, } -/// What is be guaranteed about the values? +/// What guaranteed about the values if the predicate evaluates to `true`? #[derive(Debug, Clone, PartialEq)] pub enum Guarantee { - /// `column` is one of a set of constant values + /// `column` must be one of a set of constant values In, - /// `column` is NOT one of a set of constant values + /// `column` must NOT be one of a set of constant values NotIn, } From bca390e8ee4d2773ba87fdaf5444469385a74c18 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Wed, 13 Dec 2023 09:55:09 -0500 Subject: [PATCH 04/12] Add more documentation and tests --- .../physical-expr/src/utils/guarantee.rs | 203 ++++++++++++++---- 1 file changed, 166 insertions(+), 37 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 663f00e0769f..32222819174b 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -25,30 +25,50 @@ use datafusion_expr::Operator; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -/// Represents a known guarantee that for a particular filter expression to -/// evaluate to `true`, a column must be one of a set of values or not one of -/// a set of values. +/// Represents a guarantee required for a boolean expression to evaluate to +/// `true`. +/// +/// The guarantee takes the form of a column and a set of literal (constant) +/// [`ScalarValue`]s. For the expression to evaluate to `true`, the column must +/// *satisfy* the guarantee +/// +/// To satisfy the guarantee, depending on [`Guarantee`], the values in the +/// column must either: +/// +/// 1. be ONLY one of that set +/// 2. NOT be ANY of that set +/// +/// # Uses `LiteralGuarantee`s /// /// `LiteralGuarantee`s can be used to simplify expressions and skip data files -/// (or row groups in parquet files) by proving that an expression can not -/// evaluate to true. For example, if we know that `a = 1` must be true for the -/// filter to evaluate to `true`, then we can skip any partition where we know +/// (e.g. row groups in parquet files) by proving expressions can not evaluate +/// to `true`. For example, if we have a guarantee that `a` must be in (`1`) for +/// a filter to evaluate to `true`, then we can skip any partition where we know /// that `a` never has the value of `1`. /// -/// See [`LiteralGuarantee::analyze`] to extract literal guarantees from a +/// **Important**: If a `LiteralGuarantee` is not satisfied the relevant +/// expression is *guaranteed* to evaluate to `false` or `null`. **However**, +/// the opposite does not hold. Even if all `LiteralGurantee`s are satisfied, +/// that does does not guarantee that the predicate will evaluate to `true`: it +/// may still evaluate to `true`, `false` or `null`. +/// +/// # Creating `LiteralGuarantee`s +/// +/// Use [`LiteralGuarantee::analyze`] to extract literal guarantees from a /// filter predicate. /// /// # Details /// A guarantee can be one of two forms: /// -/// 1. The column must be one of a particular set of values for the predicate -/// to be `true`. For example, -/// `(a = 1)`, `(a = 1 OR a =2) or `a IN (1, 2, 3)` +/// 1. The column must be one a particular set of values for the predicate +/// to be `true`. If the column takes on any other value, the predicate can not +/// evaluate to `true`. For example, +/// `(a = 1)`, `(a = 1 OR a = 2) or `a IN (1, 2, 3)` /// -/// The column must NOT one of a particular set of values for the predicate to -/// be `true`. For example, +/// 2. The column must NOT be one of a particular set of values for the predicate to +/// be `true`. If the column takes on any of the values, the predicate can not +/// evaluate to `true`. For example, /// `(a != 1)`, `(a != 1 AND a != 2)` or `a NOT IN (1, 2, 3)` -/// #[derive(Debug, Clone, PartialEq)] pub struct LiteralGuarantee { pub column: Column, @@ -56,19 +76,22 @@ pub struct LiteralGuarantee { pub literals: HashSet, } -/// What guaranteed about the values if the predicate evaluates to `true`? +/// What is guaranteed about the values for a [`LiteralGuarantee`]? #[derive(Debug, Clone, PartialEq)] pub enum Guarantee { - /// `column` must be one of a set of constant values + /// Guarantee that the expression is `true` if `column` is one the values. If + /// `column` is not one of the values, the expression can not be `true`. In, - /// `column` must NOT be one of a set of constant values + /// Guarantee that the expression is `true` if `column` is not ANY of the + /// values. If `column` is one of the values, the expression can not be + /// `true`. NotIn, } impl LiteralGuarantee { /// Create a new instance of the guarantee if the provided operator is /// supported. Returns None otherwise. See [`LiteralGuarantee::analyze`] to - /// create these structures from an expression. + /// create these structures from an predicate (boolean expression). fn try_new<'a>( column_name: impl Into, op: &Operator, @@ -89,22 +112,45 @@ impl LiteralGuarantee { }) } - /// Return a list of `LiteralGuarantees` for the specified predicate that - /// hold if the predicate (filter) expression evaluates to `true`. + /// Return a list of [`LiteralGuarantee`]s that must be satisfied for `expr` + /// to evaluate to `true`. /// - /// `expr` should be a boolean expression + /// If more than one `LiteralGuarantee` is returned, they must **all** hold + /// for the expression to possibly be `true`. If any is not satisfied, the + /// expression is guaranteed to be `null` or `false`. /// - /// Note: this function does not simplify the expression prior to analysis. + /// # Notes: + /// 1. `expr` must be a boolean expression. + /// 2. `expr` is not simplified prior to analysis. pub fn analyze(expr: &Arc) -> Vec { + // split conjunction: AND AND ... split_conjunction(expr) .into_iter() + // for an `AND` conjunction to be true, all terms individually must be true .fold(GuaranteeBuilder::new(), |builder, expr| { if let Some(cel) = ColOpLit::try_new(expr) { return builder.aggregate_conjunct(cel); } else { - // look for (col literal) OR (col literal) ... + // split disjunction: OR OR ... let disjunctions = split_disjunction(expr); + // We are trying to add a guarantee that a column must be + // in/not in a particular set of values for the expression + // to evaluate to true. + // + // A disjunction is true, if at least one the terms is be + // true. + // + // Thus, we can infer a guarantee if all terms are of the + // form `(col literal) OR (col literal) OR ...`. + // + // For example, we can infer that `a = 1 OR a = 2 OR a = 3` + // is guaranteed to be true ONLY if a is in (`1`, `2` or `3`). + // + // However, for something like `a = 1 OR a = 2 OR a < 0` we + // **can't** guarantee that the predicate is only true if a + // is in (`1`, `2`), as it could also be true if `a` were less + // than zero. let terms = disjunctions .iter() .filter_map(|expr| ColOpLit::try_new(expr)) @@ -115,13 +161,13 @@ impl LiteralGuarantee { } // if not all terms are of the form (col literal), - // can't infer anything + // can't infer any guarantees if terms.len() != disjunctions.len() { return builder; } // if all terms are 'col literal' with the same column - // and operation we can add a guarantee + // and operation we can infer any guarantees let first_term = &terms[0]; if terms.iter().all(|term| { term.col.name() == first_term.col.name() @@ -175,14 +221,15 @@ impl<'a> GuaranteeBuilder<'a> { ) } - /// Aggregates a new single multi column term to ths builder - /// combining with existing guarantees if possible. + /// Aggregates a new single column, multi literal term to ths builder + /// combining with previously known guarantees if possible. /// /// # Examples - /// * (`AND (a = 1 OR a = 2 OR a = 3)`): a is guaranteed to be 1, 2, or 3 - /// * `AND (a IN (1,2,3))`: a is guaranteed to be 1, 2, or 3 - /// * `AND (a != 1 OR a != 2 OR a != 3)`: a is guaranteed to not be 1, 2, or 3 - /// * `AND (a NOT IN (1,2,3))`: a is guaranteed to not be 1, 2, or 3 + /// For the following examples, we can guarantee the expression is `true` if: + /// * `AND (a = 1 OR a = 2 OR a = 3)`: a is in (1, 2, or 3) + /// * `AND (a IN (1,2,3))`: a is in (1, 2, or 3) + /// * `AND (a != 1 OR a != 2 OR a != 3)`: a is not in (1, 2, or 3) + /// * `AND (a NOT IN (1,2,3))`: a is not in (1, 2, or 3) fn aggregate_multi_conjunct( mut self, col: &'a crate::expressions::Column, @@ -195,13 +242,17 @@ impl<'a> GuaranteeBuilder<'a> { let entry = &mut self.guarantees[*index]; let Some(existing) = entry else { - // guarantee has been previously invalidated, nothing to do + // determined the previous guarantee for this column has been + // invalidated, nothing to do return self; }; - // can only combine conjuncts if we have `a != foo AND a != bar`. - // `a = foo AND a = bar` is not correct. Also, can't extend with more than one value. + // Combine conjuncts if we have `a != foo AND a != bar`. + // `a = foo AND a = bar` doesn't make logical sense, s match existing.guarantee { + // knew that the column could not be a set of values + // For example, if we previously had `a != 5` and now we see another `a != 6` we know + // if Guarantee::NotIn => { // can extend if only single literal, otherwise invalidate let new_values: HashSet<_> = new_values.into_iter().collect(); @@ -335,7 +386,46 @@ mod test { } #[test] - fn test_conjunction() { + fn test_conjunction_single_column() { + // b = 1 AND b = 2 // impossible. Ideally it should be simplifed + test_analyze(col("b").eq(lit(1)).and(col("b").eq(lit(2))), vec![]); + // b = 1 AND b != 2 + test_analyze(col("b").eq(lit(1)).and(col("b").not_eq(lit(2))), vec![]); + // b != 1 AND b == 2 + test_analyze(col("b").not_eq(lit(1)).and(col("b").eq(lit(2))), vec![]); + // b != 1 AND b != 2 + test_analyze( + col("b").not_eq(lit(1)).and(col("b").not_eq(lit(2))), + vec![not_in_guarantee("b", [1, 2])], + ); + // b != 1 AND b != 2 and b != 3 + test_analyze( + col("b") + .not_eq(lit(1)) + .and(col("b").eq(lit(2))) + .and(col("b").not_eq(lit(3))), + vec![not_in_guarantee("b", [1, 2, 3])], + ); + // b != 1 AND b != 2 and b = 3 (in theory could determine b = 3) + test_analyze( + col("b") + .not_eq(lit(1)) + .and(col("b").eq(lit(2))) + .and(col("b").not_eq(lit(3))), + vec![], + ); + // b != 1 AND b != 2 and b > 3 (can't guarantee true if b was neither 1 nor 2 + test_analyze( + col("b") + .not_eq(lit(1)) + .and(col("b").eq(lit(2))) + .and(col("b").lt(lit(3))), + vec![], + ); + } + + #[test] + fn test_conjunction_multi_column() { // a = "foo" AND b = 1 test_analyze( col("a").eq(lit("foo")).and(col("b").eq(lit(1))), @@ -397,7 +487,46 @@ mod test { } #[test] - fn test_disjunction() { + fn test_disjunction_single_column() { + // b = 1 OR b = 2 + test_analyze( + col("b").eq(lit(1)).or(col("b").eq(lit(2))), + vec![in_guarantee("b", [1, 2])], + ); + // b != 1 OR b = 2 + test_analyze(col("b").not_eq(lit(1)).or(col("b").eq(lit(2))), vec![]); + // b = 1 OR b != 2 + test_analyze(col("b").eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]); + // b != 1 OR b != 2 + test_analyze(col("b").not_eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]); + // b != 1 OR b != 2 OR b = 3 -- in theory could guarantee that b = 3 + test_analyze( + col("b") + .not_eq(lit(1)) + .or(col("b").not_eq(lit(2))) + .or(lit("b").eq(lit(3))), + vec![], + ); + // b = 1 OR b = 2 OR b = 3 + test_analyze( + col("b") + .eq(lit(1)) + .or(col("b").eq(lit(2))) + .or(lit("b").eq(lit(3))), + vec![in_guarantee("b", [1, 2, 3])], + ); + // b = 1 OR b = 2 OR b > 3 -- can't guarantee that the expression is only true if a is in (1, 2) + test_analyze( + col("b") + .eq(lit(1)) + .or(col("b").eq(lit(2))) + .or(lit("b").eq(lit(3))), + vec![], + ); + } + + #[test] + fn test_disjunction_multi_column() { // a = "foo" OR b = 1 test_analyze( col("a").eq(lit("foo")).or(col("b").eq(lit(1))), @@ -480,7 +609,7 @@ mod test { ); } - /// Guarantee that column is a specified value + /// Guarantee that the expression is true if the column is one of the specified values fn in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee where I: IntoIterator, @@ -490,7 +619,7 @@ mod test { LiteralGuarantee::try_new(column, &Operator::Eq, literals.iter()).unwrap() } - /// Guarantee that column is NOT a specified value + /// Guarantee that the expression is true if the column is NOT any of the specified values fn not_in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee where I: IntoIterator, From d73136b345942cb3a724ff2b5e2a3f173e3d6bf9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Dec 2023 08:54:12 -0500 Subject: [PATCH 05/12] refine documentation and tests --- .../physical-expr/src/utils/guarantee.rs | 95 ++++++++++++++----- 1 file changed, 71 insertions(+), 24 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 32222819174b..af3d172a39c6 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -25,12 +25,12 @@ use datafusion_expr::Operator; use std::collections::{HashMap, HashSet}; use std::sync::Arc; -/// Represents a guarantee required for a boolean expression to evaluate to -/// `true`. +/// Represents a guarantee that must be true for a boolean expression to +/// evaluate to `true`. /// /// The guarantee takes the form of a column and a set of literal (constant) -/// [`ScalarValue`]s. For the expression to evaluate to `true`, the column must -/// *satisfy* the guarantee +/// [`ScalarValue`]s. For the expression to evaluate to `true`, the column *must +/// satisfy* the guarantee(s). /// /// To satisfy the guarantee, depending on [`Guarantee`], the values in the /// column must either: @@ -40,17 +40,17 @@ use std::sync::Arc; /// /// # Uses `LiteralGuarantee`s /// -/// `LiteralGuarantee`s can be used to simplify expressions and skip data files -/// (e.g. row groups in parquet files) by proving expressions can not evaluate -/// to `true`. For example, if we have a guarantee that `a` must be in (`1`) for -/// a filter to evaluate to `true`, then we can skip any partition where we know -/// that `a` never has the value of `1`. +/// `LiteralGuarantee`s can be used to simplify filter expressions and skip data +/// files (e.g. row groups in parquet files) by proving expressions can not +/// possibly evaluate to `true`. For example, if we have a guarantee that `a` +/// must be in (`1`) for a filter to evaluate to `true`, then we can skip any +/// partition where we know that `a` never has the value of `1`. /// -/// **Important**: If a `LiteralGuarantee` is not satisfied the relevant +/// **Important**: If a `LiteralGuarantee` is not satisfied, the relevant /// expression is *guaranteed* to evaluate to `false` or `null`. **However**, /// the opposite does not hold. Even if all `LiteralGurantee`s are satisfied, -/// that does does not guarantee that the predicate will evaluate to `true`: it -/// may still evaluate to `true`, `false` or `null`. +/// that does does not guarantee that the predicate will actually evaluate to +/// `true`: it may still evaluate to `true`, `false` or `null`. /// /// # Creating `LiteralGuarantee`s /// @@ -387,12 +387,26 @@ mod test { #[test] fn test_conjunction_single_column() { - // b = 1 AND b = 2 // impossible. Ideally it should be simplifed + // b = 1 AND b = 2. This is impossible. Ideally this expression could be simplified to false test_analyze(col("b").eq(lit(1)).and(col("b").eq(lit(2))), vec![]); - // b = 1 AND b != 2 - test_analyze(col("b").eq(lit(1)).and(col("b").not_eq(lit(2))), vec![]); - // b != 1 AND b == 2 - test_analyze(col("b").not_eq(lit(1)).and(col("b").eq(lit(2))), vec![]); + // b = 1 AND b != 2 . In theory, this could be simplified to `b = 1`. + test_analyze( + col("b").eq(lit(1)).and(col("b").not_eq(lit(2))), + vec![ + // can only be true of b is 1 and b is not 2 (even though it is redundant) + in_guarantee("b", [1]), + not_in_guarantee("b", [2]), + ], + ); + // b != 1 AND b = 2. In theory, this could be simplified to `b = 2`. + test_analyze( + col("b").not_eq(lit(1)).and(col("b").eq(lit(2))), + vec![ + // can only be true of b is not 1 and b is is 2 (even though it is redundant) + not_in_guarantee("b", [1]), + in_guarantee("b", [2]), + ], + ); // b != 1 AND b != 2 test_analyze( col("b").not_eq(lit(1)).and(col("b").not_eq(lit(2))), @@ -402,25 +416,33 @@ mod test { test_analyze( col("b") .not_eq(lit(1)) - .and(col("b").eq(lit(2))) + .and(col("b").not_eq(lit(2))) .and(col("b").not_eq(lit(3))), vec![not_in_guarantee("b", [1, 2, 3])], ); - // b != 1 AND b != 2 and b = 3 (in theory could determine b = 3) + // b != 1 AND b = 2 and b != 3. Can only be true if b is 2 and b is not in (1, 3) test_analyze( col("b") .not_eq(lit(1)) .and(col("b").eq(lit(2))) .and(col("b").not_eq(lit(3))), - vec![], + vec![not_in_guarantee("b", [1, 3]), in_guarantee("b", [2])], ); - // b != 1 AND b != 2 and b > 3 (can't guarantee true if b was neither 1 nor 2 + // b != 1 AND b != 2 and b = 3 (in theory could determine b = 3) test_analyze( col("b") .not_eq(lit(1)) - .and(col("b").eq(lit(2))) + .and(col("b").not_eq(lit(2))) + .and(col("b").eq(lit(3))), + vec![not_in_guarantee("b", [1, 2]), in_guarantee("b", [3])], + ); + // b != 1 AND b != 2 and b > 3 (to be true, b can't be either 1 or 2 + test_analyze( + col("b") + .not_eq(lit(1)) + .and(col("b").not_eq(lit(2))) .and(col("b").lt(lit(3))), - vec![], + vec![not_in_guarantee("b", [1, 2])], ); } @@ -486,6 +508,31 @@ mod test { ); } + #[test] + fn test_conjunction_and_disjunction_single_column() { + // b != 1 AND (b > 2) + test_analyze( + col("b") + .not_eq(lit(1)) + .and(col("b").gt(lit(2)).or(col("b").eq(lit(3)))), + vec![ + // for the expression to be true, b can not be one + not_in_guarantee("b", [1]), + ], + ); + + // b = 1 AND (b = 2 OR b = 3). Could be simplified to false. + test_analyze( + col("b") + .eq(lit(1)) + .and(col("b").eq(lit(2)).or(col("b").eq(lit(3)))), + vec![ + // in theory, b must be 1 and one of 2,3 for this expression to be true + // which is a logical contradiction + ], + ); + } + #[test] fn test_disjunction_single_column() { // b = 1 OR b = 2 @@ -512,7 +559,7 @@ mod test { col("b") .eq(lit(1)) .or(col("b").eq(lit(2))) - .or(lit("b").eq(lit(3))), + .or(col("b").eq(lit(3))), vec![in_guarantee("b", [1, 2, 3])], ); // b = 1 OR b = 2 OR b > 3 -- can't guarantee that the expression is only true if a is in (1, 2) From 941b0f319f3662e1db5b4e0dae2cb31e268f7d5a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Dec 2023 13:24:04 -0500 Subject: [PATCH 06/12] Apply suggestions from code review Co-authored-by: Nga Tran --- datafusion/physical-expr/src/utils/guarantee.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index af3d172a39c6..c3b317c498ea 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -79,7 +79,7 @@ pub struct LiteralGuarantee { /// What is guaranteed about the values for a [`LiteralGuarantee`]? #[derive(Debug, Clone, PartialEq)] pub enum Guarantee { - /// Guarantee that the expression is `true` if `column` is one the values. If + /// Guarantee that the expression is `true` if `column` is one of the values. If /// `column` is not one of the values, the expression can not be `true`. In, /// Guarantee that the expression is `true` if `column` is not ANY of the @@ -138,7 +138,7 @@ impl LiteralGuarantee { // in/not in a particular set of values for the expression // to evaluate to true. // - // A disjunction is true, if at least one the terms is be + // A disjunction is true, if at least one of the terms is be // true. // // Thus, we can infer a guarantee if all terms are of the @@ -378,7 +378,7 @@ mod test { col("a").not_eq(lit("foo")), vec![not_in_guarantee("a", ["foo"])], ); - // a != "foo" + // "foo" != a test_analyze( lit("foo").not_eq(col("a")), vec![not_in_guarantee("a", ["foo"])], @@ -441,7 +441,7 @@ mod test { col("b") .not_eq(lit(1)) .and(col("b").not_eq(lit(2))) - .and(col("b").lt(lit(3))), + .and(col("b").gt(lit(3))), vec![not_in_guarantee("b", [1, 2])], ); } From 8da221f18f5fbeef58e87e6742d027d675e9cb38 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Dec 2023 13:27:59 -0500 Subject: [PATCH 07/12] Fix half comment --- datafusion/physical-expr/src/utils/guarantee.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index c3b317c498ea..7672becc5af6 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -247,12 +247,15 @@ impl<'a> GuaranteeBuilder<'a> { return self; }; - // Combine conjuncts if we have `a != foo AND a != bar`. - // `a = foo AND a = bar` doesn't make logical sense, s + // Combine conjuncts if we have `a != foo AND a != bar`. `a = foo + // AND a = bar` doesn't make logical sense so we don't optimize this + // case match existing.guarantee { // knew that the column could not be a set of values - // For example, if we previously had `a != 5` and now we see another `a != 6` we know - // if + // + // For example, if we previously had `a != 5` and now we see + // another `AND a != 6` we know that a must not be either 5 or 6 + // for the expression to be true Guarantee::NotIn => { // can extend if only single literal, otherwise invalidate let new_values: HashSet<_> = new_values.into_iter().collect(); @@ -260,8 +263,9 @@ impl<'a> GuaranteeBuilder<'a> { existing.literals.extend(new_values.into_iter().cloned()) } else { // this is like (a != foo AND (a != bar OR a != baz)). - // We can't combine the (a!=bar OR a!=baz) part, but it - // also doesn't invalidate a != foo guarantee. + // We can't combine the (a != bar OR a != baz) part, but + // it also doesn't invalidate our knowledge that a != + // foo is required for the expression to be true } } Guarantee::In => { From af13798bbff601256e0ee398ef882ec7196a60db Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Dec 2023 13:36:24 -0500 Subject: [PATCH 08/12] swap operators before analysis --- .../physical-expr/src/utils/guarantee.rs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 7672becc5af6..56d6936d1059 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -94,7 +94,7 @@ impl LiteralGuarantee { /// create these structures from an predicate (boolean expression). fn try_new<'a>( column_name: impl Into, - op: &Operator, + op: Operator, literals: impl IntoIterator, ) -> Option { let guarantee = match op { @@ -199,7 +199,7 @@ struct GuaranteeBuilder<'a> { /// Key is the (column name, operator type) /// Value is the index into `guarantees` - map: HashMap<(&'a crate::expressions::Column, &'a Operator), usize>, + map: HashMap<(&'a crate::expressions::Column, Operator), usize>, } impl<'a> GuaranteeBuilder<'a> { @@ -233,7 +233,7 @@ impl<'a> GuaranteeBuilder<'a> { fn aggregate_multi_conjunct( mut self, col: &'a crate::expressions::Column, - op: &'a Operator, + op: Operator, new_values: impl IntoIterator, ) -> Self { let key = (col, op); @@ -290,7 +290,7 @@ impl<'a> GuaranteeBuilder<'a> { // new_values are combined with OR, so we can only create a // multi-column guarantee for `=` (or a single value). // (e.g. ignore `a != foo OR a != bar`) - if op == &Operator::Eq || new_values.len() == 1 { + if op == Operator::Eq || new_values.len() == 1 { if let Some(guarantee) = LiteralGuarantee::try_new(col.name(), op, new_values) { @@ -314,7 +314,7 @@ impl<'a> GuaranteeBuilder<'a> { /// Represents a single `col literal` expression struct ColOpLit<'a> { col: &'a crate::expressions::Column, - op: &'a Operator, + op: Operator, lit: &'a crate::expressions::Literal, } @@ -340,14 +340,15 @@ impl<'a> ColOpLit<'a> { left.downcast_ref::(), right.downcast_ref::(), ) { - Some(Self { col, op, lit }) + Some(Self { col, op: *op, lit }) } // literal col else if let (Some(lit), Some(col)) = ( left.downcast_ref::(), right.downcast_ref::(), ) { - Some(Self { col, op, lit }) + // Used swapped operator operator, if possible + op.swap().map(|op| Self { col, op, lit }) } else { None } @@ -667,7 +668,7 @@ mod test { S: Into + 'a, { let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect(); - LiteralGuarantee::try_new(column, &Operator::Eq, literals.iter()).unwrap() + LiteralGuarantee::try_new(column, Operator::Eq, literals.iter()).unwrap() } /// Guarantee that the expression is true if the column is NOT any of the specified values @@ -677,7 +678,7 @@ mod test { S: Into + 'a, { let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect(); - LiteralGuarantee::try_new(column, &Operator::NotEq, literals.iter()).unwrap() + LiteralGuarantee::try_new(column, Operator::NotEq, literals.iter()).unwrap() } /// Convert a logical expression to a physical expression (without any simplification, etc) From 4ce193c65456b33ecd876d73afb1a0600ea2fac6 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Dec 2023 13:49:27 -0500 Subject: [PATCH 09/12] More tests --- datafusion/physical-expr/src/utils/guarantee.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 56d6936d1059..5f9633f984f9 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -502,7 +502,14 @@ mod test { col("b").gt(lit(5)).and(col("b").eq(lit(10))), vec![in_guarantee("b", [10])], ); - + // b > 10 AND b = 10 (this is impossible) + test_analyze( + col("b").gt(lit(10)).and(col("b").eq(lit(10))), + vec![ + // if b isn't 10, it can not be true (though the expression actually can never be true) + in_guarantee("b", [10]) + ], + ); // a != "foo" and (a != "bar" OR a != "baz") test_analyze( col("a") @@ -519,7 +526,7 @@ mod test { test_analyze( col("b") .not_eq(lit(1)) - .and(col("b").gt(lit(2)).or(col("b").eq(lit(3)))), + .and(col("b").gt(lit(2))), vec![ // for the expression to be true, b can not be one not_in_guarantee("b", [1]), From 27f5c36f702bbf26b699c0ab4da1d8888da7f130 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 15 Dec 2023 14:21:20 -0500 Subject: [PATCH 10/12] cmt --- datafusion/physical-expr/src/utils/guarantee.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 5f9633f984f9..3969346f2b55 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -507,7 +507,7 @@ mod test { col("b").gt(lit(10)).and(col("b").eq(lit(10))), vec![ // if b isn't 10, it can not be true (though the expression actually can never be true) - in_guarantee("b", [10]) + in_guarantee("b", [10]), ], ); // a != "foo" and (a != "bar" OR a != "baz") @@ -524,9 +524,7 @@ mod test { fn test_conjunction_and_disjunction_single_column() { // b != 1 AND (b > 2) test_analyze( - col("b") - .not_eq(lit(1)) - .and(col("b").gt(lit(2))), + col("b").not_eq(lit(1)).and(col("b").gt(lit(2))), vec![ // for the expression to be true, b can not be one not_in_guarantee("b", [1]), From cfb08cc9f1c68fb0864b0045941bb3471c42048d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 16 Dec 2023 08:07:00 -0500 Subject: [PATCH 11/12] Apply suggestions from code review Co-authored-by: Ruihang Xia --- datafusion/physical-expr/src/utils/guarantee.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 3969346f2b55..afe61fd63d58 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -49,7 +49,7 @@ use std::sync::Arc; /// **Important**: If a `LiteralGuarantee` is not satisfied, the relevant /// expression is *guaranteed* to evaluate to `false` or `null`. **However**, /// the opposite does not hold. Even if all `LiteralGurantee`s are satisfied, -/// that does does not guarantee that the predicate will actually evaluate to +/// that does not guarantee that the predicate will actually evaluate to /// `true`: it may still evaluate to `true`, `false` or `null`. /// /// # Creating `LiteralGuarantee`s @@ -60,7 +60,7 @@ use std::sync::Arc; /// # Details /// A guarantee can be one of two forms: /// -/// 1. The column must be one a particular set of values for the predicate +/// 1. The column must be one of a particular set of values for the predicate /// to be `true`. If the column takes on any other value, the predicate can not /// evaluate to `true`. For example, /// `(a = 1)`, `(a = 1 OR a = 2) or `a IN (1, 2, 3)` From f65a1ddd98bbbe29ff48cd1e462d4a12ee7a02ee Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 16 Dec 2023 08:12:11 -0500 Subject: [PATCH 12/12] refine comments more --- .../physical-expr/src/utils/guarantee.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 3969346f2b55..cb03d3bc3745 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -32,7 +32,7 @@ use std::sync::Arc; /// [`ScalarValue`]s. For the expression to evaluate to `true`, the column *must /// satisfy* the guarantee(s). /// -/// To satisfy the guarantee, depending on [`Guarantee`], the values in the +/// To satisfy the guarantee, depending on [`Guarantee`], the values in the /// column must either: /// /// 1. be ONLY one of that set @@ -49,7 +49,7 @@ use std::sync::Arc; /// **Important**: If a `LiteralGuarantee` is not satisfied, the relevant /// expression is *guaranteed* to evaluate to `false` or `null`. **However**, /// the opposite does not hold. Even if all `LiteralGurantee`s are satisfied, -/// that does does not guarantee that the predicate will actually evaluate to +/// that does does **not** guarantee that the predicate will actually evaluate to /// `true`: it may still evaluate to `true`, `false` or `null`. /// /// # Creating `LiteralGuarantee`s @@ -60,13 +60,13 @@ use std::sync::Arc; /// # Details /// A guarantee can be one of two forms: /// -/// 1. The column must be one a particular set of values for the predicate -/// to be `true`. If the column takes on any other value, the predicate can not -/// evaluate to `true`. For example, +/// 1. The column must be one the values for the predicate to be `true`. If the +/// column takes on any other value, the predicate can not evaluate to `true`. +/// For example, /// `(a = 1)`, `(a = 1 OR a = 2) or `a IN (1, 2, 3)` /// -/// 2. The column must NOT be one of a particular set of values for the predicate to -/// be `true`. If the column takes on any of the values, the predicate can not +/// 2. The column must NOT be one of the values for the predicate to be `true`. +/// If the column can ONLY take one of these values, the predicate can not /// evaluate to `true`. For example, /// `(a != 1)`, `(a != 1 AND a != 2)` or `a NOT IN (1, 2, 3)` #[derive(Debug, Clone, PartialEq)] @@ -83,8 +83,8 @@ pub enum Guarantee { /// `column` is not one of the values, the expression can not be `true`. In, /// Guarantee that the expression is `true` if `column` is not ANY of the - /// values. If `column` is one of the values, the expression can not be - /// `true`. + /// values. If `column` only takes one of these values, the expression can + /// not be `true`. NotIn, }