/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/utils/guarantee.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! [`LiteralGuarantee`] predicate analysis to determine if a column is a |
19 | | //! constant. |
20 | | |
21 | | use crate::utils::split_disjunction; |
22 | | use crate::{split_conjunction, PhysicalExpr}; |
23 | | use datafusion_common::{Column, ScalarValue}; |
24 | | use datafusion_expr::Operator; |
25 | | use std::collections::{HashMap, HashSet}; |
26 | | use std::fmt::{self, Display, Formatter}; |
27 | | use std::sync::Arc; |
28 | | |
29 | | /// Represents a guarantee that must be true for a boolean expression to |
30 | | /// evaluate to `true`. |
31 | | /// |
32 | | /// The guarantee takes the form of a column and a set of literal (constant) |
33 | | /// [`ScalarValue`]s. For the expression to evaluate to `true`, the column *must |
34 | | /// satisfy* the guarantee(s). |
35 | | /// |
36 | | /// To satisfy the guarantee, depending on [`Guarantee`], the values in the |
37 | | /// column must either: |
38 | | /// |
39 | | /// 1. be ONLY one of that set |
40 | | /// 2. NOT be ANY of that set |
41 | | /// |
42 | | /// # Uses `LiteralGuarantee`s |
43 | | /// |
44 | | /// `LiteralGuarantee`s can be used to simplify filter expressions and skip data |
45 | | /// files (e.g. row groups in parquet files) by proving expressions can not |
46 | | /// possibly evaluate to `true`. For example, if we have a guarantee that `a` |
47 | | /// must be in (`1`) for a filter to evaluate to `true`, then we can skip any |
48 | | /// partition where we know that `a` never has the value of `1`. |
49 | | /// |
50 | | /// **Important**: If a `LiteralGuarantee` is not satisfied, the relevant |
51 | | /// expression is *guaranteed* to evaluate to `false` or `null`. **However**, |
52 | | /// the opposite does not hold. Even if all `LiteralGuarantee`s are satisfied, |
53 | | /// that does **not** guarantee that the predicate will actually evaluate to |
54 | | /// `true`: it may still evaluate to `true`, `false` or `null`. |
55 | | /// |
56 | | /// # Creating `LiteralGuarantee`s |
57 | | /// |
58 | | /// Use [`LiteralGuarantee::analyze`] to extract literal guarantees from a |
59 | | /// filter predicate. |
60 | | /// |
61 | | /// # Details |
62 | | /// A guarantee can be one of two forms: |
63 | | /// |
64 | | /// 1. The column must be one of the values for the predicate to be `true`. If the |
65 | | /// column takes on any other value, the predicate can not evaluate to `true`. |
66 | | /// For example, |
67 | | /// `(a = 1)`, `(a = 1 OR a = 2)` or `a IN (1, 2, 3)` |
68 | | /// |
69 | | /// 2. The column must NOT be one of the values for the predicate to be `true`. |
70 | | /// If the column can ONLY take one of these values, the predicate can not |
71 | | /// evaluate to `true`. For example, |
72 | | /// `(a != 1)`, `(a != 1 AND a != 2)` or `a NOT IN (1, 2, 3)` |
73 | | #[derive(Debug, Clone, PartialEq)] |
74 | | pub struct LiteralGuarantee { |
75 | | pub column: Column, |
76 | | pub guarantee: Guarantee, |
77 | | pub literals: HashSet<ScalarValue>, |
78 | | } |
79 | | |
80 | | /// What is guaranteed about the values for a [`LiteralGuarantee`]? |
81 | | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] |
82 | | pub enum Guarantee { |
83 | | /// Guarantee that the expression can only be `true` if `column` is one of the values. If |
84 | | /// `column` is not one of the values, the expression can not be `true`. |
85 | | In, |
86 | | /// Guarantee that the expression can only be `true` if `column` is not ANY of the |
87 | | /// values. If `column` only takes one of these values, the expression can |
88 | | /// not be `true`. |
89 | | NotIn, |
90 | | } |
91 | | |
92 | | impl LiteralGuarantee { |
93 | | /// Create a new guarantee for the column named `column_name` from the given |
94 | | /// `guarantee` kind and set of `literals`. See [`LiteralGuarantee::analyze`] to |
95 | | /// create these structures from a predicate (boolean expression). |
96 | 0 | fn new<'a>( |
97 | 0 | column_name: impl Into<String>, |
98 | 0 | guarantee: Guarantee, |
99 | 0 | literals: impl IntoIterator<Item = &'a ScalarValue>, |
100 | 0 | ) -> Self { |
101 | 0 | let literals: HashSet<_> = literals.into_iter().cloned().collect(); |
102 | 0 |
|
103 | 0 | Self { |
104 | 0 | column: Column::from_name(column_name), |
105 | 0 | guarantee, |
106 | 0 | literals, |
107 | 0 | } |
108 | 0 | } |
109 | | |
110 | | /// Return a list of [`LiteralGuarantee`]s that must be satisfied for `expr` |
111 | | /// to evaluate to `true`. |
112 | | /// |
113 | | /// If more than one `LiteralGuarantee` is returned, they must **all** hold |
114 | | /// for the expression to possibly be `true`. If any is not satisfied, the |
115 | | /// expression is guaranteed to be `null` or `false`. |
116 | | /// |
117 | | /// # Notes: |
118 | | /// 1. `expr` must be a boolean expression or inlist expression. |
119 | | /// 2. `expr` is not simplified prior to analysis. |
120 | 0 | pub fn analyze(expr: &Arc<dyn PhysicalExpr>) -> Vec<LiteralGuarantee> { |
121 | 0 | // split conjunction: <expr> AND <expr> AND ... |
122 | 0 | split_conjunction(expr) |
123 | 0 | .into_iter() |
124 | 0 | // for an `AND` conjunction to be true, all terms individually must be true |
125 | 0 | .fold(GuaranteeBuilder::new(), |builder, expr| { |
126 | 0 | if let Some(cel) = ColOpLit::try_new(expr) { |
127 | 0 | return builder.aggregate_conjunct(cel); |
128 | 0 | } else if let Some(inlist) = expr |
129 | 0 | .as_any() |
130 | 0 | .downcast_ref::<crate::expressions::InListExpr>() |
131 | | { |
132 | | // Only support single-column inlist currently, multi-column inlist is not supported |
133 | 0 | let col = inlist |
134 | 0 | .expr() |
135 | 0 | .as_any() |
136 | 0 | .downcast_ref::<crate::expressions::Column>(); |
137 | 0 | let Some(col) = col else { |
138 | 0 | return builder; |
139 | | }; |
140 | | |
141 | 0 | let literals = inlist |
142 | 0 | .list() |
143 | 0 | .iter() |
144 | 0 | .map(|e| e.as_any().downcast_ref::<crate::expressions::Literal>()) |
145 | 0 | .collect::<Option<Vec<_>>>(); |
146 | 0 | let Some(literals) = literals else { |
147 | 0 | return builder; |
148 | | }; |
149 | | |
150 | 0 | let guarantee = if inlist.negated() { |
151 | 0 | Guarantee::NotIn |
152 | | } else { |
153 | 0 | Guarantee::In |
154 | | }; |
155 | | |
156 | 0 | builder.aggregate_multi_conjunct( |
157 | 0 | col, |
158 | 0 | guarantee, |
159 | 0 | literals.iter().map(|e| e.value()), |
160 | 0 | ) |
161 | | } else { |
162 | | // split disjunction: <expr> OR <expr> OR ... |
163 | 0 | let disjunctions = split_disjunction(expr); |
164 | 0 |
|
165 | 0 | // We are trying to add a guarantee that a column must be |
166 | 0 | // in/not in a particular set of values for the expression |
167 | 0 | // to evaluate to true. |
168 | 0 | // |
169 | 0 | // A disjunction is true if at least one of the terms is |
170 | 0 | // true. |
171 | 0 | // |
172 | 0 | // Thus, we can infer a guarantee if all terms are of the |
173 | 0 | // form `(col <op> literal) OR (col <op> literal) OR ...`. |
174 | 0 | // |
175 | 0 | // For example, we can infer that `a = 1 OR a = 2 OR a = 3` |
176 | 0 | // is guaranteed to be true ONLY if a is in (`1`, `2` or `3`). |
177 | 0 | // |
178 | 0 | // However, for something like `a = 1 OR a = 2 OR a < 0` we |
179 | 0 | // **can't** guarantee that the predicate is only true if a |
180 | 0 | // is in (`1`, `2`), as it could also be true if `a` were less |
181 | 0 | // than zero. |
182 | 0 | let terms = disjunctions |
183 | 0 | .iter() |
184 | 0 | .filter_map(|expr| ColOpLit::try_new(expr)) |
185 | 0 | .collect::<Vec<_>>(); |
186 | 0 |
|
187 | 0 | if terms.is_empty() { |
188 | 0 | return builder; |
189 | 0 | } |
190 | 0 |
|
191 | 0 | // if not all terms are of the form (col <op> literal), |
192 | 0 | // can't infer any guarantees |
193 | 0 | if terms.len() != disjunctions.len() { |
194 | 0 | return builder; |
195 | 0 | } |
196 | 0 |
|
197 | 0 | // if all terms are 'col <op> literal' with the same column |
198 | 0 | // and the `=` operation, we can infer a guarantee |
199 | 0 | // |
200 | 0 | // For those like (a != foo AND (a != bar OR a != baz)). |
201 | 0 | // We can't combine the (a != bar OR a != baz) part, but |
202 | 0 | // it also doesn't invalidate our knowledge that a != |
203 | 0 | // foo is required for the expression to be true. |
204 | 0 | // So we can only create a multi value guarantee for `=` |
205 | 0 | // (or a single value). (e.g. ignore `a != foo OR a != bar`) |
206 | 0 | let first_term = &terms[0]; |
207 | 0 | if terms.iter().all(|term| { |
208 | 0 | term.col.name() == first_term.col.name() |
209 | 0 | && term.guarantee == Guarantee::In |
210 | 0 | }) { |
211 | 0 | builder.aggregate_multi_conjunct( |
212 | 0 | first_term.col, |
213 | 0 | Guarantee::In, |
214 | 0 | terms.iter().map(|term| term.lit.value()), |
215 | 0 | ) |
216 | | } else { |
217 | | // can't infer anything |
218 | 0 | builder |
219 | | } |
220 | | } |
221 | 0 | }) |
222 | 0 | .build() |
223 | 0 | } |
224 | | } |
225 | | |
226 | | impl Display for LiteralGuarantee { |
227 | 0 | fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { |
228 | 0 | let mut sorted_literals: Vec<_> = |
229 | 0 | self.literals.iter().map(|lit| lit.to_string()).collect(); |
230 | 0 | sorted_literals.sort(); |
231 | 0 | match self.guarantee { |
232 | 0 | Guarantee::In => write!( |
233 | 0 | f, |
234 | 0 | "{} in ({})", |
235 | 0 | self.column.name, |
236 | 0 | sorted_literals.join(", ") |
237 | 0 | ), |
238 | 0 | Guarantee::NotIn => write!( |
239 | 0 | f, |
240 | 0 | "{} not in ({})", |
241 | 0 | self.column.name, |
242 | 0 | sorted_literals.join(", ") |
243 | 0 | ), |
244 | | } |
245 | 0 | } |
246 | | } |
247 | | |
248 | | /// Combines conjuncts (aka terms `AND`ed together) into [`LiteralGuarantee`]s, |
249 | | /// preserving insert order |
250 | | #[derive(Debug, Default)] |
251 | | struct GuaranteeBuilder<'a> { |
252 | | /// List of guarantees that have been created so far. |
253 | | /// If we have determined a subsequent conjunct invalidates a guarantee, |
254 | | /// e.g. `a = foo AND a = bar`, then the relevant guarantee will be None. |
255 | | guarantees: Vec<Option<LiteralGuarantee>>, |
256 | | |
257 | | /// Key is the (column, guarantee type) pair |
258 | | /// Value is the index into `guarantees` |
259 | | map: HashMap<(&'a crate::expressions::Column, Guarantee), usize>, |
260 | | } |
261 | | |
262 | | impl<'a> GuaranteeBuilder<'a> { |
263 | 0 | fn new() -> Self { |
264 | 0 | Default::default() |
265 | 0 | } |
266 | | |
267 | | /// Aggregate a new single `AND col <op> literal` term to this builder |
268 | | /// combining with existing guarantees if possible. |
269 | | /// |
270 | | /// # Examples |
271 | | /// * `AND (a = 1)`: `a` is guaranteed to be 1 |
272 | | /// * `AND (a != 1)`: a is guaranteed to not be 1 |
273 | 0 | fn aggregate_conjunct(self, col_op_lit: ColOpLit<'a>) -> Self { |
274 | 0 | self.aggregate_multi_conjunct( |
275 | 0 | col_op_lit.col, |
276 | 0 | col_op_lit.guarantee, |
277 | 0 | [col_op_lit.lit.value()], |
278 | 0 | ) |
279 | 0 | } |
280 | | |
281 | | /// Aggregates a new single column, multi literal term to this builder |
282 | | /// combining with previously known guarantees if possible. |
283 | | /// |
284 | | /// # Examples |
285 | | /// For the following examples, the expression can only be `true` if: |
286 | | /// * `AND (a = 1 OR a = 2 OR a = 3)`: a is in (1, 2, or 3) |
287 | | /// * `AND (a IN (1,2,3))`: a is in (1, 2, or 3) |
288 | | /// * `AND (a != 1) AND (a != 2) AND (a != 3)`: a is not in (1, 2, or 3) |
289 | | /// * `AND (a NOT IN (1,2,3))`: a is not in (1, 2, or 3) |
290 | 0 | fn aggregate_multi_conjunct( |
291 | 0 | mut self, |
292 | 0 | col: &'a crate::expressions::Column, |
293 | 0 | guarantee: Guarantee, |
294 | 0 | new_values: impl IntoIterator<Item = &'a ScalarValue>, |
295 | 0 | ) -> Self { |
296 | 0 | let key = (col, guarantee); |
297 | 0 | if let Some(index) = self.map.get(&key) { |
298 | | // already have a guarantee for this column |
299 | 0 | let entry = &mut self.guarantees[*index]; |
300 | | |
301 | 0 | let Some(existing) = entry else { |
302 | | // determined the previous guarantee for this column has been |
303 | | // invalidated, nothing to do |
304 | 0 | return self; |
305 | | }; |
306 | | |
307 | | // Combine with the existing guarantee: for `a != foo AND a != bar` |
308 | | // the `NotIn` sets are unioned, while for `In` the sets are |
309 | | // intersected (`a = foo AND a = bar` invalidates the guarantee) |
310 | 0 | match existing.guarantee { |
311 | | // knew that the column could not be a set of values |
312 | | // |
313 | | // For example, if we previously had `a != 5` and now we see |
314 | | // another `AND a != 6` we know that a must not be either 5 or 6 |
315 | | // for the expression to be true |
316 | 0 | Guarantee::NotIn => { |
317 | 0 | let new_values: HashSet<_> = new_values.into_iter().collect(); |
318 | 0 | existing.literals.extend(new_values.into_iter().cloned()); |
319 | 0 | } |
320 | | Guarantee::In => { |
321 | 0 | let intersection = new_values |
322 | 0 | .into_iter() |
323 | 0 | .filter(|new_value| existing.literals.contains(*new_value)) |
324 | 0 | .collect::<Vec<_>>(); |
325 | 0 | // for an In guarantee, if the intersection is not empty, we can narrow the guarantee |
326 | 0 | // e.g. `a IN (1,2,3) AND a IN (2,3,4)` is `a IN (2,3)` |
327 | 0 | // otherwise, we invalidate the guarantee |
328 | 0 | // e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()`, which is invalid |
329 | 0 | if !intersection.is_empty() { |
330 | 0 | existing.literals = intersection.into_iter().cloned().collect(); |
331 | 0 | } else { |
332 | 0 | // no value is common to both sets, so invalidate the guarantee |
333 | 0 | *entry = None; |
334 | 0 | } |
335 | | } |
336 | | } |
337 | 0 | } else { |
338 | 0 | // This is a new guarantee |
339 | 0 | let new_values: HashSet<_> = new_values.into_iter().collect(); |
340 | 0 |
|
341 | 0 | let guarantee = LiteralGuarantee::new(col.name(), guarantee, new_values); |
342 | 0 | // add it to the list of guarantees |
343 | 0 | self.guarantees.push(Some(guarantee)); |
344 | 0 | self.map.insert(key, self.guarantees.len() - 1); |
345 | 0 | } |
346 | | |
347 | 0 | self |
348 | 0 | } |
349 | | |
350 | | /// Return all guarantees that have been created so far |
351 | 0 | fn build(self) -> Vec<LiteralGuarantee> { |
352 | 0 | // filter out any guarantees that have been invalidated |
353 | 0 | self.guarantees.into_iter().flatten().collect() |
354 | 0 | } |
355 | | } |
356 | | |
357 | | /// Represents a single `col = literal` or `col != literal` expression (operands in either order) |
358 | | struct ColOpLit<'a> { |
359 | | col: &'a crate::expressions::Column, |
360 | | guarantee: Guarantee, |
361 | | lit: &'a crate::expressions::Literal, |
362 | | } |
363 | | |
364 | | impl<'a> ColOpLit<'a> { |
365 | | /// Returns `Some(ColOpLit)` if the operator is `=` or `!=` and the |
366 | | /// expression has one of the forms: |
367 | | /// 1. `col <op> literal` |
368 | | /// 2. `literal <op> col` |
369 | | /// |
370 | | /// Returns None otherwise |
371 | 0 | fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> { |
372 | 0 | let binary_expr = expr |
373 | 0 | .as_any() |
374 | 0 | .downcast_ref::<crate::expressions::BinaryExpr>()?; |
375 | | |
376 | 0 | let (left, op, right) = ( |
377 | 0 | binary_expr.left().as_any(), |
378 | 0 | binary_expr.op(), |
379 | 0 | binary_expr.right().as_any(), |
380 | 0 | ); |
381 | 0 | let guarantee = match op { |
382 | 0 | Operator::Eq => Guarantee::In, |
383 | 0 | Operator::NotEq => Guarantee::NotIn, |
384 | 0 | _ => return None, |
385 | | }; |
386 | | // col <op> literal |
387 | 0 | if let (Some(col), Some(lit)) = ( |
388 | 0 | left.downcast_ref::<crate::expressions::Column>(), |
389 | 0 | right.downcast_ref::<crate::expressions::Literal>(), |
390 | | ) { |
391 | 0 | Some(Self { |
392 | 0 | col, |
393 | 0 | guarantee, |
394 | 0 | lit, |
395 | 0 | }) |
396 | | } |
397 | | // literal <op> col |
398 | 0 | else if let (Some(lit), Some(col)) = ( |
399 | 0 | left.downcast_ref::<crate::expressions::Literal>(), |
400 | 0 | right.downcast_ref::<crate::expressions::Column>(), |
401 | | ) { |
402 | 0 | Some(Self { |
403 | 0 | col, |
404 | 0 | guarantee, |
405 | 0 | lit, |
406 | 0 | }) |
407 | | } else { |
408 | 0 | None |
409 | | } |
410 | 0 | } |
411 | | } |
412 | | |
413 | | #[cfg(test)] |
414 | | mod test { |
415 | | use std::sync::OnceLock; |
416 | | |
417 | | use super::*; |
418 | | use crate::planner::logical2physical; |
419 | | |
420 | | use arrow_schema::{DataType, Field, Schema, SchemaRef}; |
421 | | use datafusion_expr::expr_fn::*; |
422 | | use datafusion_expr::{lit, Expr}; |
423 | | |
424 | | use itertools::Itertools; |
425 | | |
426 | | #[test] |
427 | | fn test_literal() { |
428 | | // a single literal offers no guarantee |
429 | | test_analyze(lit(true), vec![]) |
430 | | } |
431 | | |
432 | | #[test] |
433 | | fn test_single() { |
434 | | // a = "foo" |
435 | | test_analyze(col("a").eq(lit("foo")), vec![in_guarantee("a", ["foo"])]); |
436 | | // "foo" = a |
437 | | test_analyze(lit("foo").eq(col("a")), vec![in_guarantee("a", ["foo"])]); |
438 | | // a != "foo" |
439 | | test_analyze( |
440 | | col("a").not_eq(lit("foo")), |
441 | | vec![not_in_guarantee("a", ["foo"])], |
442 | | ); |
443 | | // "foo" != a |
444 | | test_analyze( |
445 | | lit("foo").not_eq(col("a")), |
446 | | vec![not_in_guarantee("a", ["foo"])], |
447 | | ); |
448 | | } |
449 | | |
450 | | #[test] |
451 | | fn test_conjunction_single_column() { |
452 | | // b = 1 AND b = 2. This is impossible. Ideally this expression could be simplified to false |
453 | | test_analyze(col("b").eq(lit(1)).and(col("b").eq(lit(2))), vec![]); |
454 | | // b = 1 AND b != 2 . In theory, this could be simplified to `b = 1`. |
455 | | test_analyze( |
456 | | col("b").eq(lit(1)).and(col("b").not_eq(lit(2))), |
457 | | vec![ |
458 | | // can only be true if b is 1 and b is not 2 (even though it is redundant) |
459 | | in_guarantee("b", [1]), |
460 | | not_in_guarantee("b", [2]), |
461 | | ], |
462 | | ); |
463 | | // b != 1 AND b = 2. In theory, this could be simplified to `b = 2`. |
464 | | test_analyze( |
465 | | col("b").not_eq(lit(1)).and(col("b").eq(lit(2))), |
466 | | vec![ |
467 | | // can only be true if b is not 1 and b is 2 (even though it is redundant) |
468 | | not_in_guarantee("b", [1]), |
469 | | in_guarantee("b", [2]), |
470 | | ], |
471 | | ); |
472 | | // b != 1 AND b != 2 |
473 | | test_analyze( |
474 | | col("b").not_eq(lit(1)).and(col("b").not_eq(lit(2))), |
475 | | vec![not_in_guarantee("b", [1, 2])], |
476 | | ); |
477 | | // b != 1 AND b != 2 and b != 3 |
478 | | test_analyze( |
479 | | col("b") |
480 | | .not_eq(lit(1)) |
481 | | .and(col("b").not_eq(lit(2))) |
482 | | .and(col("b").not_eq(lit(3))), |
483 | | vec![not_in_guarantee("b", [1, 2, 3])], |
484 | | ); |
485 | | // b != 1 AND b = 2 and b != 3. Can only be true if b is 2 and b is not in (1, 3) |
486 | | test_analyze( |
487 | | col("b") |
488 | | .not_eq(lit(1)) |
489 | | .and(col("b").eq(lit(2))) |
490 | | .and(col("b").not_eq(lit(3))), |
491 | | vec![not_in_guarantee("b", [1, 3]), in_guarantee("b", [2])], |
492 | | ); |
493 | | // b != 1 AND b != 2 and b = 3 (in theory could determine b = 3) |
494 | | test_analyze( |
495 | | col("b") |
496 | | .not_eq(lit(1)) |
497 | | .and(col("b").not_eq(lit(2))) |
498 | | .and(col("b").eq(lit(3))), |
499 | | vec![not_in_guarantee("b", [1, 2]), in_guarantee("b", [3])], |
500 | | ); |
501 | | // b != 1 AND b != 2 and b > 3 (to be true, b can't be either 1 or 2) |
502 | | test_analyze( |
503 | | col("b") |
504 | | .not_eq(lit(1)) |
505 | | .and(col("b").not_eq(lit(2))) |
506 | | .and(col("b").gt(lit(3))), |
507 | | vec![not_in_guarantee("b", [1, 2])], |
508 | | ); |
509 | | } |
510 | | |
511 | | #[test] |
512 | | fn test_conjunction_multi_column() { |
513 | | // a = "foo" AND b = 1 |
514 | | test_analyze( |
515 | | col("a").eq(lit("foo")).and(col("b").eq(lit(1))), |
516 | | vec![ |
517 | | // should find both column guarantees |
518 | | in_guarantee("a", ["foo"]), |
519 | | in_guarantee("b", [1]), |
520 | | ], |
521 | | ); |
522 | | // a != "foo" AND b != 1 |
523 | | test_analyze( |
524 | | col("a").not_eq(lit("foo")).and(col("b").not_eq(lit(1))), |
525 | | // should find both column guarantees |
526 | | vec![not_in_guarantee("a", ["foo"]), not_in_guarantee("b", [1])], |
527 | | ); |
528 | | // a = "foo" AND a = "bar" |
529 | | test_analyze( |
530 | | col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))), |
531 | | // this predicate is impossible ( can't be both foo and bar), |
532 | | vec![], |
533 | | ); |
534 | | // a = "foo" AND a != "bar" |
535 | | test_analyze( |
536 | | col("a").eq(lit("foo")).and(col("a").not_eq(lit("bar"))), |
537 | | vec![in_guarantee("a", ["foo"]), not_in_guarantee("a", ["bar"])], |
538 | | ); |
539 | | // a != "foo" AND a != "bar" |
540 | | test_analyze( |
541 | | col("a").not_eq(lit("foo")).and(col("a").not_eq(lit("bar"))), |
542 | | // know it isn't "foo" or "bar" |
543 | | vec![not_in_guarantee("a", ["foo", "bar"])], |
544 | | ); |
545 | | // a != "foo" AND a != "bar" and a != "baz" |
546 | | test_analyze( |
547 | | col("a") |
548 | | .not_eq(lit("foo")) |
549 | | .and(col("a").not_eq(lit("bar"))) |
550 | | .and(col("a").not_eq(lit("baz"))), |
551 | | // know it isn't "foo" or "bar" or "baz" |
552 | | vec![not_in_guarantee("a", ["foo", "bar", "baz"])], |
553 | | ); |
554 | | // a = "foo" AND a = "foo" |
555 | | let expr = col("a").eq(lit("foo")); |
556 | | test_analyze(expr.clone().and(expr), vec![in_guarantee("a", ["foo"])]); |
557 | | // b > 5 AND b = 10 (should get a b = 10 guarantee) |
558 | | test_analyze( |
559 | | col("b").gt(lit(5)).and(col("b").eq(lit(10))), |
560 | | vec![in_guarantee("b", [10])], |
561 | | ); |
562 | | // b > 10 AND b = 10 (this is impossible) |
563 | | test_analyze( |
564 | | col("b").gt(lit(10)).and(col("b").eq(lit(10))), |
565 | | vec![ |
566 | | // if b isn't 10, it can not be true (though the expression actually can never be true) |
567 | | in_guarantee("b", [10]), |
568 | | ], |
569 | | ); |
570 | | // a != "foo" and (a != "bar" OR a != "baz") |
571 | | test_analyze( |
572 | | col("a") |
573 | | .not_eq(lit("foo")) |
574 | | .and(col("a").not_eq(lit("bar")).or(col("a").not_eq(lit("baz")))), |
575 | | // a is not foo (we can't represent other knowledge about a) |
576 | | vec![not_in_guarantee("a", ["foo"])], |
577 | | ); |
578 | | } |
579 | | |
580 | | #[test] |
581 | | fn test_conjunction_and_disjunction_single_column() { |
582 | | // b != 1 AND (b > 2) |
583 | | test_analyze( |
584 | | col("b").not_eq(lit(1)).and(col("b").gt(lit(2))), |
585 | | vec![ |
586 | | // for the expression to be true, b can not be 1 |
587 | | not_in_guarantee("b", [1]), |
588 | | ], |
589 | | ); |
590 | | |
591 | | // b = 1 AND (b = 2 OR b = 3). Could be simplified to false. |
592 | | test_analyze( |
593 | | col("b") |
594 | | .eq(lit(1)) |
595 | | .and(col("b").eq(lit(2)).or(col("b").eq(lit(3)))), |
596 | | vec![ |
597 | | // in theory, b must be 1 and one of 2,3 for this expression to be true |
598 | | // which is a logical contradiction |
599 | | ], |
600 | | ); |
601 | | } |
602 | | |
603 | | #[test] |
604 | | fn test_disjunction_single_column() { |
605 | | // b = 1 OR b = 2 |
606 | | test_analyze( |
607 | | col("b").eq(lit(1)).or(col("b").eq(lit(2))), |
608 | | vec![in_guarantee("b", [1, 2])], |
609 | | ); |
610 | | // b != 1 OR b = 2 |
611 | | test_analyze(col("b").not_eq(lit(1)).or(col("b").eq(lit(2))), vec![]); |
612 | | // b = 1 OR b != 2 |
613 | | test_analyze(col("b").eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]); |
614 | | // b != 1 OR b != 2 |
615 | | test_analyze(col("b").not_eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]); |
616 | | // b != 1 OR b != 2 OR b = 3 -- mixes `!=` and `=` terms, so no guarantee is produced |
617 | | test_analyze( |
618 | | col("b") |
619 | | .not_eq(lit(1)) |
620 | | .or(col("b").not_eq(lit(2))) |
621 | | .or(col("b").eq(lit(3))), |
622 | | vec![], |
623 | | ); |
624 | | // b = 1 OR b = 2 OR b = 3 |
625 | | test_analyze( |
626 | | col("b") |
627 | | .eq(lit(1)) |
628 | | .or(col("b").eq(lit(2))) |
629 | | .or(col("b").eq(lit(3))), |
630 | | vec![in_guarantee("b", [1, 2, 3])], |
631 | | ); |
632 | | // b = 1 OR b = 2 OR b > 3 -- can't guarantee that the expression is only true if b is in (1, 2) |
633 | | test_analyze( |
634 | | col("b") |
635 | | .eq(lit(1)) |
636 | | .or(col("b").eq(lit(2))) |
637 | | .or(col("b").gt(lit(3))), |
638 | | vec![], |
639 | | ); |
640 | | } |
641 | | |
642 | | #[test] |
643 | | fn test_disjunction_multi_column() { |
644 | | // a = "foo" OR b = 1 |
645 | | test_analyze( |
646 | | col("a").eq(lit("foo")).or(col("b").eq(lit(1))), |
647 | | // can't have a single column guarantee (the predicate can be true when a != "foo" as long as b = 1, and vice versa) |
648 | | vec![], |
649 | | ); |
650 | | // a != "foo" OR b != 1 |
651 | | test_analyze( |
652 | | col("a").not_eq(lit("foo")).or(col("b").not_eq(lit(1))), |
653 | | // No single column guarantee |
654 | | vec![], |
655 | | ); |
656 | | // a = "foo" OR a = "bar" |
657 | | test_analyze( |
658 | | col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))), |
659 | | vec![in_guarantee("a", ["foo", "bar"])], |
660 | | ); |
661 | | // a = "foo" OR a = "foo" |
662 | | test_analyze( |
663 | | col("a").eq(lit("foo")).or(col("a").eq(lit("foo"))), |
664 | | vec![in_guarantee("a", ["foo"])], |
665 | | ); |
666 | | // a != "foo" OR a != "bar" |
667 | | test_analyze( |
668 | | col("a").not_eq(lit("foo")).or(col("a").not_eq(lit("bar"))), |
669 | | // can't represent knowledge about a in this case |
670 | | vec![], |
671 | | ); |
672 | | // a = "foo" OR a = "bar" OR a = "baz" |
673 | | test_analyze( |
674 | | col("a") |
675 | | .eq(lit("foo")) |
676 | | .or(col("a").eq(lit("bar"))) |
677 | | .or(col("a").eq(lit("baz"))), |
678 | | vec![in_guarantee("a", ["foo", "bar", "baz"])], |
679 | | ); |
680 | | // (a = "foo" OR a = "bar") AND (a = "baz") |
681 | | test_analyze( |
682 | | (col("a").eq(lit("foo")).or(col("a").eq(lit("bar")))) |
683 | | .and(col("a").eq(lit("baz"))), |
684 | | // this could potentially be represented as 2 constraints with a more |
685 | | // sophisticated analysis |
686 | | vec![], |
687 | | ); |
688 | | // (a = "foo" OR a = "bar") AND (b = 1) |
689 | | test_analyze( |
690 | | (col("a").eq(lit("foo")).or(col("a").eq(lit("bar")))) |
691 | | .and(col("b").eq(lit(1))), |
692 | | vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1])], |
693 | | ); |
694 | | // (a = "foo" OR a = "bar") OR (b = 1) |
695 | | test_analyze( |
696 | | col("a") |
697 | | .eq(lit("foo")) |
698 | | .or(col("a").eq(lit("bar"))) |
699 | | .or(col("b").eq(lit(1))), |
700 | | // can't represent knowledge about a or b in this case |
701 | | vec![], |
702 | | ); |
703 | | } |
704 | | |
705 | | #[test] |
706 | | fn test_single_inlist() { |
707 | | // b IN (1, 2, 3) |
708 | | test_analyze( |
709 | | col("b").in_list(vec![lit(1), lit(2), lit(3)], false), |
710 | | vec![in_guarantee("b", [1, 2, 3])], |
711 | | ); |
712 | | // b NOT IN (1, 2, 3) |
713 | | test_analyze( |
714 | | col("b").in_list(vec![lit(1), lit(2), lit(3)], true), |
715 | | vec![not_in_guarantee("b", [1, 2, 3])], |
716 | | ); |
717 | | // b IN (1,2,3,4...24) |
718 | | test_analyze( |
719 | | col("b").in_list((1..25).map(lit).collect_vec(), false), |
720 | | vec![in_guarantee("b", 1..25)], |
721 | | ); |
722 | | } |
723 | | |
724 | | #[test] |
725 | | fn test_inlist_conjunction() { |
726 | | // b IN (1, 2, 3) AND b IN (2, 3, 4) |
727 | | test_analyze( |
728 | | col("b") |
729 | | .in_list(vec![lit(1), lit(2), lit(3)], false) |
730 | | .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)), |
731 | | vec![in_guarantee("b", [2, 3])], |
732 | | ); |
733 | | // b NOT IN (1, 2, 3) AND b IN (2, 3, 4) |
734 | | test_analyze( |
735 | | col("b") |
736 | | .in_list(vec![lit(1), lit(2), lit(3)], true) |
737 | | .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)), |
738 | | vec![ |
739 | | not_in_guarantee("b", [1, 2, 3]), |
740 | | in_guarantee("b", [2, 3, 4]), |
741 | | ], |
742 | | ); |
743 | | // b NOT IN (1, 2, 3) AND b NOT IN (2, 3, 4) |
744 | | test_analyze( |
745 | | col("b") |
746 | | .in_list(vec![lit(1), lit(2), lit(3)], true) |
747 | | .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], true)), |
748 | | vec![not_in_guarantee("b", [1, 2, 3, 4])], |
749 | | ); |
750 | | // b IN (1, 2, 3) AND b = 4 |
751 | | test_analyze( |
752 | | col("b") |
753 | | .in_list(vec![lit(1), lit(2), lit(3)], false) |
754 | | .and(col("b").eq(lit(4))), |
755 | | vec![], |
756 | | ); |
757 | | // b IN (1, 2, 3) AND b = 2 |
758 | | test_analyze( |
759 | | col("b") |
760 | | .in_list(vec![lit(1), lit(2), lit(3)], false) |
761 | | .and(col("b").eq(lit(2))), |
762 | | vec![in_guarantee("b", [2])], |
763 | | ); |
764 | | // b IN (1, 2, 3) AND b != 2 |
765 | | test_analyze( |
766 | | col("b") |
767 | | .in_list(vec![lit(1), lit(2), lit(3)], false) |
768 | | .and(col("b").not_eq(lit(2))), |
769 | | vec![in_guarantee("b", [1, 2, 3]), not_in_guarantee("b", [2])], |
770 | | ); |
771 | | // b NOT IN (1, 2, 3) AND b != 4 |
772 | | test_analyze( |
773 | | col("b") |
774 | | .in_list(vec![lit(1), lit(2), lit(3)], true) |
775 | | .and(col("b").not_eq(lit(4))), |
776 | | vec![not_in_guarantee("b", [1, 2, 3, 4])], |
777 | | ); |
778 | | // b NOT IN (1, 2, 3) AND b != 2 |
779 | | test_analyze( |
780 | | col("b") |
781 | | .in_list(vec![lit(1), lit(2), lit(3)], true) |
782 | | .and(col("b").not_eq(lit(2))), |
783 | | vec![not_in_guarantee("b", [1, 2, 3])], |
784 | | ); |
785 | | } |
786 | | |
787 | | #[test] |
788 | | fn test_inlist_with_disjunction() { |
789 | | // b IN (1, 2, 3) AND (b = 3 OR b = 4) |
790 | | test_analyze( |
791 | | col("b") |
792 | | .in_list(vec![lit(1), lit(2), lit(3)], false) |
793 | | .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))), |
794 | | vec![in_guarantee("b", [3])], |
795 | | ); |
796 | | // b IN (1, 2, 3) AND (b = 4 OR b = 5) |
797 | | test_analyze( |
798 | | col("b") |
799 | | .in_list(vec![lit(1), lit(2), lit(3)], false) |
800 | | .and(col("b").eq(lit(4)).or(col("b").eq(lit(5)))), |
801 | | vec![], |
802 | | ); |
803 | | // b NOT IN (1, 2, 3) AND (b = 3 OR b = 4) |
804 | | test_analyze( |
805 | | col("b") |
806 | | .in_list(vec![lit(1), lit(2), lit(3)], true) |
807 | | .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))), |
808 | | vec![not_in_guarantee("b", [1, 2, 3]), in_guarantee("b", [3, 4])], |
809 | | ); |
810 | | // b IN (1, 2, 3) OR b = 2 |
811 | | // TODO this should be in_guarantee("b", [1, 2, 3]) but analyzing this kind of disjunction is not currently supported. Only `ColOpLit OR ColOpLit` is supported. |
812 | | test_analyze( |
813 | | col("b") |
814 | | .in_list(vec![lit(1), lit(2), lit(3)], false) |
815 | | .or(col("b").eq(lit(2))), |
816 | | vec![], |
817 | | ); |
818 | | // b IN (1, 2, 3) OR b != 3 |
819 | | test_analyze( |
820 | | col("b") |
821 | | .in_list(vec![lit(1), lit(2), lit(3)], false) |
822 | | .or(col("b").not_eq(lit(3))), |
823 | | vec![], |
824 | | ); |
825 | | } |
826 | | |
827 | | /// Tests that analyzing expr results in the expected guarantees |
828 | | fn test_analyze(expr: Expr, expected: Vec<LiteralGuarantee>) { |
829 | | println!("Begin analyze of {expr}"); |
830 | | let schema = schema(); |
831 | | let physical_expr = logical2physical(&expr, &schema); |
832 | | |
833 | | let actual = LiteralGuarantee::analyze(&physical_expr); |
834 | | assert_eq!( |
835 | | expected, actual, |
836 | | "expr: {expr}\ |
837 | | \n\nexpected: {expected:#?}\ |
838 | | \n\nactual: {actual:#?}\ |
839 | | \n\nexpr: {expr:#?}\ |
840 | | \n\nphysical_expr: {physical_expr:#?}" |
841 | | ); |
842 | | } |
843 | | |
844 | | /// Builds the expected `In` guarantee: the expression can only be true if the column is one of the specified values |
845 | | fn in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee |
846 | | where |
847 | | I: IntoIterator<Item = S>, |
848 | | S: Into<ScalarValue> + 'a, |
849 | | { |
850 | | let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect(); |
851 | | LiteralGuarantee::new(column, Guarantee::In, literals.iter()) |
852 | | } |
853 | | |
854 | | /// Builds the expected `NotIn` guarantee: the expression can only be true if the column is NOT any of the specified values |
855 | | fn not_in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee |
856 | | where |
857 | | I: IntoIterator<Item = S>, |
858 | | S: Into<ScalarValue> + 'a, |
859 | | { |
860 | | let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect(); |
861 | | LiteralGuarantee::new(column, Guarantee::NotIn, literals.iter()) |
862 | | } |
863 | | |
864 | | // Schema for testing |
865 | | fn schema() -> SchemaRef { |
866 | | Arc::clone(SCHEMA.get_or_init(|| { |
867 | | Arc::new(Schema::new(vec![ |
868 | | Field::new("a", DataType::Utf8, false), |
869 | | Field::new("b", DataType::Int32, false), |
870 | | ])) |
871 | | })) |
872 | | } |
873 | | |
874 | | static SCHEMA: OnceLock<SchemaRef> = OnceLock::new(); |
875 | | } |