Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/utils/guarantee.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! [`LiteralGuarantee`] predicate analysis to determine if a column is a
19
//! constant.
20
21
use crate::utils::split_disjunction;
22
use crate::{split_conjunction, PhysicalExpr};
23
use datafusion_common::{Column, ScalarValue};
24
use datafusion_expr::Operator;
25
use std::collections::{HashMap, HashSet};
26
use std::fmt::{self, Display, Formatter};
27
use std::sync::Arc;
28
29
/// Represents a guarantee that must be true for a boolean expression to
30
/// evaluate to `true`.
31
///
32
/// The guarantee takes the form of a column and a set of literal (constant)
33
/// [`ScalarValue`]s. For the expression to evaluate to `true`, the column *must
34
/// satisfy* the guarantee(s).
35
///
36
/// To satisfy the guarantee, depending on [`Guarantee`], the values in the
37
/// column must either:
38
///
39
/// 1. be ONLY one of that set
40
/// 2. NOT be ANY of that set
41
///
42
/// # Uses `LiteralGuarantee`s
43
///
44
/// `LiteralGuarantee`s can be used to simplify filter expressions and skip data
45
/// files (e.g. row groups in parquet files) by proving expressions can not
46
/// possibly evaluate to `true`. For example, if we have a guarantee that `a`
47
/// must be in (`1`) for a filter to evaluate to `true`, then we can skip any
48
/// partition where we know that `a` never has the value of `1`.
49
///
50
/// **Important**: If a `LiteralGuarantee` is not satisfied, the relevant
51
/// expression is *guaranteed* to evaluate to `false` or `null`. **However**,
52
/// the opposite does not hold. Even if all `LiteralGuarantee`s are satisfied,
53
/// that does **not** guarantee that the predicate will actually evaluate to
54
/// `true`: it may still evaluate to `true`, `false` or `null`.
55
///
56
/// # Creating `LiteralGuarantee`s
57
///
58
/// Use [`LiteralGuarantee::analyze`] to extract literal guarantees from a
59
/// filter predicate.
60
///
61
/// # Details
62
/// A guarantee can be one of two forms:
63
///
64
/// 1. The column must be one the values for the predicate to be `true`. If the
65
///    column takes on any other value, the predicate can not evaluate to `true`.
66
///    For example,
67
///    `(a = 1)`, `(a = 1 OR a = 2)` or `a IN (1, 2, 3)`
68
///
69
/// 2. The column must NOT be one of the values for the predicate to be `true`.
70
///    If the column can ONLY take one of these values, the predicate can not
71
///    evaluate to `true`. For example,
72
///    `(a != 1)`, `(a != 1 AND a != 2)` or `a NOT IN (1, 2, 3)`
73
#[derive(Debug, Clone, PartialEq)]
74
pub struct LiteralGuarantee {
75
    pub column: Column,
76
    pub guarantee: Guarantee,
77
    pub literals: HashSet<ScalarValue>,
78
}
79
80
/// What is guaranteed about the values for a [`LiteralGuarantee`]?
81
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
82
pub enum Guarantee {
83
    /// Guarantee that the expression is `true` if `column` is one of the values. If
84
    /// `column` is not one of the values, the expression can not be `true`.
85
    In,
86
    /// Guarantee that the expression is `true` if `column` is not ANY of the
87
    /// values. If `column` only takes one of these values, the expression can
88
    /// not be `true`.
89
    NotIn,
90
}
91
92
impl LiteralGuarantee {
93
    /// Create a new instance of the guarantee if the provided operator is
94
    /// supported. Returns None otherwise. See [`LiteralGuarantee::analyze`] to
95
    /// create these structures from an predicate (boolean expression).
96
0
    fn new<'a>(
97
0
        column_name: impl Into<String>,
98
0
        guarantee: Guarantee,
99
0
        literals: impl IntoIterator<Item = &'a ScalarValue>,
100
0
    ) -> Self {
101
0
        let literals: HashSet<_> = literals.into_iter().cloned().collect();
102
0
103
0
        Self {
104
0
            column: Column::from_name(column_name),
105
0
            guarantee,
106
0
            literals,
107
0
        }
108
0
    }
109
110
    /// Return a list of [`LiteralGuarantee`]s that must be satisfied for `expr`
111
    /// to evaluate to `true`.
112
    ///
113
    /// If more than one `LiteralGuarantee` is returned, they must **all** hold
114
    /// for the expression to possibly be `true`. If any is not satisfied, the
115
    /// expression is guaranteed to be `null` or `false`.
116
    ///
117
    /// # Notes:
118
    /// 1. `expr` must be a boolean expression or inlist expression.
119
    /// 2. `expr` is not simplified prior to analysis.
120
0
    pub fn analyze(expr: &Arc<dyn PhysicalExpr>) -> Vec<LiteralGuarantee> {
121
0
        // split conjunction: <expr> AND <expr> AND ...
122
0
        split_conjunction(expr)
123
0
            .into_iter()
124
0
            // for an `AND` conjunction to be true, all terms individually must be true
125
0
            .fold(GuaranteeBuilder::new(), |builder, expr| {
126
0
                if let Some(cel) = ColOpLit::try_new(expr) {
127
0
                    return builder.aggregate_conjunct(cel);
128
0
                } else if let Some(inlist) = expr
129
0
                    .as_any()
130
0
                    .downcast_ref::<crate::expressions::InListExpr>()
131
                {
132
                    // Only support single-column inlist currently, multi-column inlist is not supported
133
0
                    let col = inlist
134
0
                        .expr()
135
0
                        .as_any()
136
0
                        .downcast_ref::<crate::expressions::Column>();
137
0
                    let Some(col) = col else {
138
0
                        return builder;
139
                    };
140
141
0
                    let literals = inlist
142
0
                        .list()
143
0
                        .iter()
144
0
                        .map(|e| e.as_any().downcast_ref::<crate::expressions::Literal>())
145
0
                        .collect::<Option<Vec<_>>>();
146
0
                    let Some(literals) = literals else {
147
0
                        return builder;
148
                    };
149
150
0
                    let guarantee = if inlist.negated() {
151
0
                        Guarantee::NotIn
152
                    } else {
153
0
                        Guarantee::In
154
                    };
155
156
0
                    builder.aggregate_multi_conjunct(
157
0
                        col,
158
0
                        guarantee,
159
0
                        literals.iter().map(|e| e.value()),
160
0
                    )
161
                } else {
162
                    // split disjunction: <expr> OR <expr> OR ...
163
0
                    let disjunctions = split_disjunction(expr);
164
0
165
0
                    // We are trying to add a guarantee that a column must be
166
0
                    // in/not in a particular set of values for the expression
167
0
                    // to evaluate to true.
168
0
                    //
169
0
                    // A disjunction is true, if at least one of the terms is be
170
0
                    // true.
171
0
                    //
172
0
                    // Thus, we can infer a guarantee if all terms are of the
173
0
                    // form `(col <op> literal) OR (col <op> literal) OR ...`.
174
0
                    //
175
0
                    // For example, we can infer that `a = 1 OR a = 2 OR a = 3`
176
0
                    // is guaranteed to be true ONLY if a is in (`1`, `2` or `3`).
177
0
                    //
178
0
                    // However, for something like  `a = 1 OR a = 2 OR a < 0` we
179
0
                    // **can't** guarantee that the predicate is only true if a
180
0
                    // is in (`1`, `2`), as it could also be true if `a` were less
181
0
                    // than zero.
182
0
                    let terms = disjunctions
183
0
                        .iter()
184
0
                        .filter_map(|expr| ColOpLit::try_new(expr))
185
0
                        .collect::<Vec<_>>();
186
0
187
0
                    if terms.is_empty() {
188
0
                        return builder;
189
0
                    }
190
0
191
0
                    // if not all terms are of the form (col <op> literal),
192
0
                    // can't infer any guarantees
193
0
                    if terms.len() != disjunctions.len() {
194
0
                        return builder;
195
0
                    }
196
0
197
0
                    // if all terms are 'col <op> literal' with the same column
198
0
                    // and operation we can infer any guarantees
199
0
                    //
200
0
                    // For those like (a != foo AND (a != bar OR a != baz)).
201
0
                    // We can't combine the (a != bar OR a != baz) part, but
202
0
                    // it also doesn't invalidate our knowledge that a !=
203
0
                    // foo is required for the expression to be true.
204
0
                    // So we can only create a multi value guarantee for `=`
205
0
                    // (or a single value). (e.g. ignore `a != foo OR a != bar`)
206
0
                    let first_term = &terms[0];
207
0
                    if terms.iter().all(|term| {
208
0
                        term.col.name() == first_term.col.name()
209
0
                            && term.guarantee == Guarantee::In
210
0
                    }) {
211
0
                        builder.aggregate_multi_conjunct(
212
0
                            first_term.col,
213
0
                            Guarantee::In,
214
0
                            terms.iter().map(|term| term.lit.value()),
215
0
                        )
216
                    } else {
217
                        // can't infer anything
218
0
                        builder
219
                    }
220
                }
221
0
            })
222
0
            .build()
223
0
    }
224
}
225
226
impl Display for LiteralGuarantee {
227
0
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
228
0
        let mut sorted_literals: Vec<_> =
229
0
            self.literals.iter().map(|lit| lit.to_string()).collect();
230
0
        sorted_literals.sort();
231
0
        match self.guarantee {
232
0
            Guarantee::In => write!(
233
0
                f,
234
0
                "{} in ({})",
235
0
                self.column.name,
236
0
                sorted_literals.join(", ")
237
0
            ),
238
0
            Guarantee::NotIn => write!(
239
0
                f,
240
0
                "{} not in ({})",
241
0
                self.column.name,
242
0
                sorted_literals.join(", ")
243
0
            ),
244
        }
245
0
    }
246
}
247
248
/// Combines conjuncts (aka terms `AND`ed together) into [`LiteralGuarantee`]s,
249
/// preserving insert order
250
#[derive(Debug, Default)]
251
struct GuaranteeBuilder<'a> {
252
    /// List of guarantees that have been created so far
253
    /// if we have determined a subsequent conjunct invalidates a guarantee
254
    /// e.g. `a = foo AND a = bar` then the relevant guarantee will be None
255
    guarantees: Vec<Option<LiteralGuarantee>>,
256
257
    /// Key is the (column name, guarantee type)
258
    /// Value is the index into `guarantees`
259
    map: HashMap<(&'a crate::expressions::Column, Guarantee), usize>,
260
}
261
262
impl<'a> GuaranteeBuilder<'a> {
263
0
    fn new() -> Self {
264
0
        Default::default()
265
0
    }
266
267
    /// Aggregate a new single `AND col <op> literal` term to this builder
268
    /// combining with existing guarantees if possible.
269
    ///
270
    /// # Examples
271
    /// * `AND (a = 1)`: `a` is guaranteed to be 1
272
    /// * `AND (a != 1)`: a is guaranteed to not be 1
273
0
    fn aggregate_conjunct(self, col_op_lit: ColOpLit<'a>) -> Self {
274
0
        self.aggregate_multi_conjunct(
275
0
            col_op_lit.col,
276
0
            col_op_lit.guarantee,
277
0
            [col_op_lit.lit.value()],
278
0
        )
279
0
    }
280
281
    /// Aggregates a new single column, multi literal term to this builder
282
    /// combining with previously known guarantees if possible.
283
    ///
284
    /// # Examples
285
    /// For the following examples, we can guarantee the expression is `true` if:
286
    /// * `AND (a = 1 OR a = 2 OR a = 3)`: a is in (1, 2, or 3)
287
    /// * `AND (a IN (1,2,3))`: a is in (1, 2, or 3)
288
    /// * `AND (a != 1 OR a != 2 OR a != 3)`: a is not in (1, 2, or 3)
289
    /// * `AND (a NOT IN (1,2,3))`: a is not in (1, 2, or 3)
290
0
    fn aggregate_multi_conjunct(
291
0
        mut self,
292
0
        col: &'a crate::expressions::Column,
293
0
        guarantee: Guarantee,
294
0
        new_values: impl IntoIterator<Item = &'a ScalarValue>,
295
0
    ) -> Self {
296
0
        let key = (col, guarantee);
297
0
        if let Some(index) = self.map.get(&key) {
298
            // already have a guarantee for this column
299
0
            let entry = &mut self.guarantees[*index];
300
301
0
            let Some(existing) = entry else {
302
                // determined the previous guarantee for this column has been
303
                // invalidated, nothing to do
304
0
                return self;
305
            };
306
307
            // Combine conjuncts if we have `a != foo AND a != bar`. `a = foo
308
            // AND a = bar` doesn't make logical sense so we don't optimize this
309
            // case
310
0
            match existing.guarantee {
311
                // knew that the column could not be a set of values
312
                //
313
                // For example, if we previously had `a != 5` and now we see
314
                // another `AND a != 6` we know that a must not be either 5 or 6
315
                // for the expression to be true
316
0
                Guarantee::NotIn => {
317
0
                    let new_values: HashSet<_> = new_values.into_iter().collect();
318
0
                    existing.literals.extend(new_values.into_iter().cloned());
319
0
                }
320
                Guarantee::In => {
321
0
                    let intersection = new_values
322
0
                        .into_iter()
323
0
                        .filter(|new_value| existing.literals.contains(*new_value))
324
0
                        .collect::<Vec<_>>();
325
0
                    // for an In guarantee, if the intersection is not empty,  we can extend the guarantee
326
0
                    // e.g. `a IN (1,2,3) AND a IN (2,3,4)` is `a IN (2,3)`
327
0
                    // otherwise, we invalidate the guarantee
328
0
                    // e.g. `a IN (1,2,3) AND a IN (4,5,6)` is `a IN ()`, which is invalid
329
0
                    if !intersection.is_empty() {
330
0
                        existing.literals = intersection.into_iter().cloned().collect();
331
0
                    } else {
332
0
                        // at least one was not, so invalidate the guarantee
333
0
                        *entry = None;
334
0
                    }
335
                }
336
            }
337
0
        } else {
338
0
            // This is a new guarantee
339
0
            let new_values: HashSet<_> = new_values.into_iter().collect();
340
0
341
0
            let guarantee = LiteralGuarantee::new(col.name(), guarantee, new_values);
342
0
            // add it to the list of guarantees
343
0
            self.guarantees.push(Some(guarantee));
344
0
            self.map.insert(key, self.guarantees.len() - 1);
345
0
        }
346
347
0
        self
348
0
    }
349
350
    /// Return all guarantees that have been created so far
351
0
    fn build(self) -> Vec<LiteralGuarantee> {
352
0
        // filter out any guarantees that have been invalidated
353
0
        self.guarantees.into_iter().flatten().collect()
354
0
    }
355
}
356
357
/// Represents a single `col [not]in literal` expression
358
struct ColOpLit<'a> {
359
    col: &'a crate::expressions::Column,
360
    guarantee: Guarantee,
361
    lit: &'a crate::expressions::Literal,
362
}
363
364
impl<'a> ColOpLit<'a> {
365
    /// Returns Some(ColEqLit) if the expression is either:
366
    /// 1. `col <op> literal`
367
    /// 2. `literal <op> col`
368
    /// 3. operator is `=` or `!=`
369
    ///
370
    /// Returns None otherwise
371
0
    fn try_new(expr: &'a Arc<dyn PhysicalExpr>) -> Option<Self> {
372
0
        let binary_expr = expr
373
0
            .as_any()
374
0
            .downcast_ref::<crate::expressions::BinaryExpr>()?;
375
376
0
        let (left, op, right) = (
377
0
            binary_expr.left().as_any(),
378
0
            binary_expr.op(),
379
0
            binary_expr.right().as_any(),
380
0
        );
381
0
        let guarantee = match op {
382
0
            Operator::Eq => Guarantee::In,
383
0
            Operator::NotEq => Guarantee::NotIn,
384
0
            _ => return None,
385
        };
386
        // col <op> literal
387
0
        if let (Some(col), Some(lit)) = (
388
0
            left.downcast_ref::<crate::expressions::Column>(),
389
0
            right.downcast_ref::<crate::expressions::Literal>(),
390
        ) {
391
0
            Some(Self {
392
0
                col,
393
0
                guarantee,
394
0
                lit,
395
0
            })
396
        }
397
        // literal <op> col
398
0
        else if let (Some(lit), Some(col)) = (
399
0
            left.downcast_ref::<crate::expressions::Literal>(),
400
0
            right.downcast_ref::<crate::expressions::Column>(),
401
        ) {
402
0
            Some(Self {
403
0
                col,
404
0
                guarantee,
405
0
                lit,
406
0
            })
407
        } else {
408
0
            None
409
        }
410
0
    }
411
}
412
413
#[cfg(test)]
414
mod test {
415
    use std::sync::OnceLock;
416
417
    use super::*;
418
    use crate::planner::logical2physical;
419
420
    use arrow_schema::{DataType, Field, Schema, SchemaRef};
421
    use datafusion_expr::expr_fn::*;
422
    use datafusion_expr::{lit, Expr};
423
424
    use itertools::Itertools;
425
426
    #[test]
427
    fn test_literal() {
428
        // a single literal offers no guarantee
429
        test_analyze(lit(true), vec![])
430
    }
431
432
    #[test]
433
    fn test_single() {
434
        // a = "foo"
435
        test_analyze(col("a").eq(lit("foo")), vec![in_guarantee("a", ["foo"])]);
436
        // "foo" = a
437
        test_analyze(lit("foo").eq(col("a")), vec![in_guarantee("a", ["foo"])]);
438
        // a != "foo"
439
        test_analyze(
440
            col("a").not_eq(lit("foo")),
441
            vec![not_in_guarantee("a", ["foo"])],
442
        );
443
        // "foo" != a
444
        test_analyze(
445
            lit("foo").not_eq(col("a")),
446
            vec![not_in_guarantee("a", ["foo"])],
447
        );
448
    }
449
450
    #[test]
451
    fn test_conjunction_single_column() {
452
        // b = 1 AND b = 2. This is impossible. Ideally this expression could be simplified to false
453
        test_analyze(col("b").eq(lit(1)).and(col("b").eq(lit(2))), vec![]);
454
        // b = 1 AND b != 2 . In theory, this could be simplified to `b = 1`.
455
        test_analyze(
456
            col("b").eq(lit(1)).and(col("b").not_eq(lit(2))),
457
            vec![
458
                // can only be true of b is 1 and b is not 2 (even though it is redundant)
459
                in_guarantee("b", [1]),
460
                not_in_guarantee("b", [2]),
461
            ],
462
        );
463
        // b != 1 AND b = 2. In theory, this could be simplified to `b = 2`.
464
        test_analyze(
465
            col("b").not_eq(lit(1)).and(col("b").eq(lit(2))),
466
            vec![
467
                // can only be true of b is not 1 and b is 2 (even though it is redundant)
468
                not_in_guarantee("b", [1]),
469
                in_guarantee("b", [2]),
470
            ],
471
        );
472
        // b != 1 AND b != 2
473
        test_analyze(
474
            col("b").not_eq(lit(1)).and(col("b").not_eq(lit(2))),
475
            vec![not_in_guarantee("b", [1, 2])],
476
        );
477
        // b != 1 AND b != 2 and b != 3
478
        test_analyze(
479
            col("b")
480
                .not_eq(lit(1))
481
                .and(col("b").not_eq(lit(2)))
482
                .and(col("b").not_eq(lit(3))),
483
            vec![not_in_guarantee("b", [1, 2, 3])],
484
        );
485
        // b != 1 AND b = 2 and b != 3. Can only be true if b is 2 and b is not in (1, 3)
486
        test_analyze(
487
            col("b")
488
                .not_eq(lit(1))
489
                .and(col("b").eq(lit(2)))
490
                .and(col("b").not_eq(lit(3))),
491
            vec![not_in_guarantee("b", [1, 3]), in_guarantee("b", [2])],
492
        );
493
        // b != 1 AND b != 2 and b = 3 (in theory could determine b = 3)
494
        test_analyze(
495
            col("b")
496
                .not_eq(lit(1))
497
                .and(col("b").not_eq(lit(2)))
498
                .and(col("b").eq(lit(3))),
499
            vec![not_in_guarantee("b", [1, 2]), in_guarantee("b", [3])],
500
        );
501
        // b != 1 AND b != 2 and b > 3 (to be true, b can't be either 1 or 2
502
        test_analyze(
503
            col("b")
504
                .not_eq(lit(1))
505
                .and(col("b").not_eq(lit(2)))
506
                .and(col("b").gt(lit(3))),
507
            vec![not_in_guarantee("b", [1, 2])],
508
        );
509
    }
510
511
    #[test]
512
    fn test_conjunction_multi_column() {
513
        // a = "foo" AND b = 1
514
        test_analyze(
515
            col("a").eq(lit("foo")).and(col("b").eq(lit(1))),
516
            vec![
517
                // should find both column guarantees
518
                in_guarantee("a", ["foo"]),
519
                in_guarantee("b", [1]),
520
            ],
521
        );
522
        // a != "foo" AND b != 1
523
        test_analyze(
524
            col("a").not_eq(lit("foo")).and(col("b").not_eq(lit(1))),
525
            // should find both column guarantees
526
            vec![not_in_guarantee("a", ["foo"]), not_in_guarantee("b", [1])],
527
        );
528
        // a = "foo" AND a = "bar"
529
        test_analyze(
530
            col("a").eq(lit("foo")).and(col("a").eq(lit("bar"))),
531
            // this predicate is impossible ( can't be both foo and bar),
532
            vec![],
533
        );
534
        // a = "foo" AND b != "bar"
535
        test_analyze(
536
            col("a").eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
537
            vec![in_guarantee("a", ["foo"]), not_in_guarantee("a", ["bar"])],
538
        );
539
        // a != "foo" AND a != "bar"
540
        test_analyze(
541
            col("a").not_eq(lit("foo")).and(col("a").not_eq(lit("bar"))),
542
            // know it isn't "foo" or "bar"
543
            vec![not_in_guarantee("a", ["foo", "bar"])],
544
        );
545
        // a != "foo" AND a != "bar" and a != "baz"
546
        test_analyze(
547
            col("a")
548
                .not_eq(lit("foo"))
549
                .and(col("a").not_eq(lit("bar")))
550
                .and(col("a").not_eq(lit("baz"))),
551
            // know it isn't "foo" or "bar" or "baz"
552
            vec![not_in_guarantee("a", ["foo", "bar", "baz"])],
553
        );
554
        // a = "foo" AND a = "foo"
555
        let expr = col("a").eq(lit("foo"));
556
        test_analyze(expr.clone().and(expr), vec![in_guarantee("a", ["foo"])]);
557
        // b > 5 AND b = 10 (should get an b = 10 guarantee)
558
        test_analyze(
559
            col("b").gt(lit(5)).and(col("b").eq(lit(10))),
560
            vec![in_guarantee("b", [10])],
561
        );
562
        // b > 10 AND b = 10 (this is impossible)
563
        test_analyze(
564
            col("b").gt(lit(10)).and(col("b").eq(lit(10))),
565
            vec![
566
                //  if b isn't 10, it can not be true (though the expression actually can never be true)
567
                in_guarantee("b", [10]),
568
            ],
569
        );
570
        // a != "foo" and (a != "bar" OR a != "baz")
571
        test_analyze(
572
            col("a")
573
                .not_eq(lit("foo"))
574
                .and(col("a").not_eq(lit("bar")).or(col("a").not_eq(lit("baz")))),
575
            // a is not foo (we can't represent other knowledge about a)
576
            vec![not_in_guarantee("a", ["foo"])],
577
        );
578
    }
579
580
    #[test]
581
    fn test_conjunction_and_disjunction_single_column() {
582
        // b != 1 AND (b > 2)
583
        test_analyze(
584
            col("b").not_eq(lit(1)).and(col("b").gt(lit(2))),
585
            vec![
586
                // for the expression to be true, b can not be one
587
                not_in_guarantee("b", [1]),
588
            ],
589
        );
590
591
        // b = 1 AND (b = 2 OR b = 3). Could be simplified to false.
592
        test_analyze(
593
            col("b")
594
                .eq(lit(1))
595
                .and(col("b").eq(lit(2)).or(col("b").eq(lit(3)))),
596
            vec![
597
                // in theory, b must be 1 and one of 2,3 for this expression to be true
598
                // which is a logical contradiction
599
            ],
600
        );
601
    }
602
603
    #[test]
604
    fn test_disjunction_single_column() {
605
        // b = 1 OR b = 2
606
        test_analyze(
607
            col("b").eq(lit(1)).or(col("b").eq(lit(2))),
608
            vec![in_guarantee("b", [1, 2])],
609
        );
610
        // b != 1 OR b = 2
611
        test_analyze(col("b").not_eq(lit(1)).or(col("b").eq(lit(2))), vec![]);
612
        // b = 1 OR b != 2
613
        test_analyze(col("b").eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
614
        // b != 1 OR b != 2
615
        test_analyze(col("b").not_eq(lit(1)).or(col("b").not_eq(lit(2))), vec![]);
616
        // b != 1 OR b != 2 OR b = 3 -- in theory could guarantee that b = 3
617
        test_analyze(
618
            col("b")
619
                .not_eq(lit(1))
620
                .or(col("b").not_eq(lit(2)))
621
                .or(lit("b").eq(lit(3))),
622
            vec![],
623
        );
624
        // b = 1 OR b = 2 OR b = 3
625
        test_analyze(
626
            col("b")
627
                .eq(lit(1))
628
                .or(col("b").eq(lit(2)))
629
                .or(col("b").eq(lit(3))),
630
            vec![in_guarantee("b", [1, 2, 3])],
631
        );
632
        // b = 1 OR b = 2 OR b > 3 -- can't guarantee that the expression is only true if a is in (1, 2)
633
        test_analyze(
634
            col("b")
635
                .eq(lit(1))
636
                .or(col("b").eq(lit(2)))
637
                .or(lit("b").eq(lit(3))),
638
            vec![],
639
        );
640
    }
641
642
    #[test]
643
    fn test_disjunction_multi_column() {
644
        // a = "foo" OR b = 1
645
        test_analyze(
646
            col("a").eq(lit("foo")).or(col("b").eq(lit(1))),
647
            // no can't have a single column guarantee (if a = "foo" then b != 1) etc
648
            vec![],
649
        );
650
        // a != "foo" OR b != 1
651
        test_analyze(
652
            col("a").not_eq(lit("foo")).or(col("b").not_eq(lit(1))),
653
            // No single column guarantee
654
            vec![],
655
        );
656
        // a = "foo" OR a = "bar"
657
        test_analyze(
658
            col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))),
659
            vec![in_guarantee("a", ["foo", "bar"])],
660
        );
661
        // a = "foo" OR a = "foo"
662
        test_analyze(
663
            col("a").eq(lit("foo")).or(col("a").eq(lit("foo"))),
664
            vec![in_guarantee("a", ["foo"])],
665
        );
666
        // a != "foo" OR a != "bar"
667
        test_analyze(
668
            col("a").not_eq(lit("foo")).or(col("a").not_eq(lit("bar"))),
669
            // can't represent knowledge about a in this case
670
            vec![],
671
        );
672
        // a = "foo" OR a = "bar" OR a = "baz"
673
        test_analyze(
674
            col("a")
675
                .eq(lit("foo"))
676
                .or(col("a").eq(lit("bar")))
677
                .or(col("a").eq(lit("baz"))),
678
            vec![in_guarantee("a", ["foo", "bar", "baz"])],
679
        );
680
        // (a = "foo" OR a = "bar") AND (a = "baz)"
681
        test_analyze(
682
            (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
683
                .and(col("a").eq(lit("baz"))),
684
            // this could potentially be represented as 2 constraints with a more
685
            // sophisticated analysis
686
            vec![],
687
        );
688
        // (a = "foo" OR a = "bar") AND (b = 1)
689
        test_analyze(
690
            (col("a").eq(lit("foo")).or(col("a").eq(lit("bar"))))
691
                .and(col("b").eq(lit(1))),
692
            vec![in_guarantee("a", ["foo", "bar"]), in_guarantee("b", [1])],
693
        );
694
        // (a = "foo" OR a = "bar") OR (b = 1)
695
        test_analyze(
696
            col("a")
697
                .eq(lit("foo"))
698
                .or(col("a").eq(lit("bar")))
699
                .or(col("b").eq(lit(1))),
700
            // can't represent knowledge about a or b in this case
701
            vec![],
702
        );
703
    }
704
705
    #[test]
706
    fn test_single_inlist() {
707
        // b IN (1, 2, 3)
708
        test_analyze(
709
            col("b").in_list(vec![lit(1), lit(2), lit(3)], false),
710
            vec![in_guarantee("b", [1, 2, 3])],
711
        );
712
        // b NOT IN (1, 2, 3)
713
        test_analyze(
714
            col("b").in_list(vec![lit(1), lit(2), lit(3)], true),
715
            vec![not_in_guarantee("b", [1, 2, 3])],
716
        );
717
        // b IN (1,2,3,4...24)
718
        test_analyze(
719
            col("b").in_list((1..25).map(lit).collect_vec(), false),
720
            vec![in_guarantee("b", 1..25)],
721
        );
722
    }
723
724
    #[test]
725
    fn test_inlist_conjunction() {
726
        // b IN (1, 2, 3) AND b IN (2, 3, 4)
727
        test_analyze(
728
            col("b")
729
                .in_list(vec![lit(1), lit(2), lit(3)], false)
730
                .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
731
            vec![in_guarantee("b", [2, 3])],
732
        );
733
        // b NOT IN (1, 2, 3) AND b IN (2, 3, 4)
734
        test_analyze(
735
            col("b")
736
                .in_list(vec![lit(1), lit(2), lit(3)], true)
737
                .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], false)),
738
            vec![
739
                not_in_guarantee("b", [1, 2, 3]),
740
                in_guarantee("b", [2, 3, 4]),
741
            ],
742
        );
743
        // b NOT IN (1, 2, 3) AND b NOT IN (2, 3, 4)
744
        test_analyze(
745
            col("b")
746
                .in_list(vec![lit(1), lit(2), lit(3)], true)
747
                .and(col("b").in_list(vec![lit(2), lit(3), lit(4)], true)),
748
            vec![not_in_guarantee("b", [1, 2, 3, 4])],
749
        );
750
        // b IN (1, 2, 3) AND b = 4
751
        test_analyze(
752
            col("b")
753
                .in_list(vec![lit(1), lit(2), lit(3)], false)
754
                .and(col("b").eq(lit(4))),
755
            vec![],
756
        );
757
        // b IN (1, 2, 3) AND b = 2
758
        test_analyze(
759
            col("b")
760
                .in_list(vec![lit(1), lit(2), lit(3)], false)
761
                .and(col("b").eq(lit(2))),
762
            vec![in_guarantee("b", [2])],
763
        );
764
        // b IN (1, 2, 3) AND b != 2
765
        test_analyze(
766
            col("b")
767
                .in_list(vec![lit(1), lit(2), lit(3)], false)
768
                .and(col("b").not_eq(lit(2))),
769
            vec![in_guarantee("b", [1, 2, 3]), not_in_guarantee("b", [2])],
770
        );
771
        // b NOT IN (1, 2, 3) AND b != 4
772
        test_analyze(
773
            col("b")
774
                .in_list(vec![lit(1), lit(2), lit(3)], true)
775
                .and(col("b").not_eq(lit(4))),
776
            vec![not_in_guarantee("b", [1, 2, 3, 4])],
777
        );
778
        // b NOT IN (1, 2, 3) AND b != 2
779
        test_analyze(
780
            col("b")
781
                .in_list(vec![lit(1), lit(2), lit(3)], true)
782
                .and(col("b").not_eq(lit(2))),
783
            vec![not_in_guarantee("b", [1, 2, 3])],
784
        );
785
    }
786
787
    #[test]
788
    fn test_inlist_with_disjunction() {
789
        // b IN (1, 2, 3) AND (b = 3 OR b = 4)
790
        test_analyze(
791
            col("b")
792
                .in_list(vec![lit(1), lit(2), lit(3)], false)
793
                .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
794
            vec![in_guarantee("b", [3])],
795
        );
796
        // b IN (1, 2, 3) AND (b = 4 OR b = 5)
797
        test_analyze(
798
            col("b")
799
                .in_list(vec![lit(1), lit(2), lit(3)], false)
800
                .and(col("b").eq(lit(4)).or(col("b").eq(lit(5)))),
801
            vec![],
802
        );
803
        // b NOT IN (1, 2, 3) AND (b = 3 OR b = 4)
804
        test_analyze(
805
            col("b")
806
                .in_list(vec![lit(1), lit(2), lit(3)], true)
807
                .and(col("b").eq(lit(3)).or(col("b").eq(lit(4)))),
808
            vec![not_in_guarantee("b", [1, 2, 3]), in_guarantee("b", [3, 4])],
809
        );
810
        // b IN (1, 2, 3) OR b = 2
811
        // TODO this should be in_guarantee("b", [1, 2, 3]) but currently we don't support to anylize this kind of disjunction. Only `ColOpLit OR ColOpLit` is supported.
812
        test_analyze(
813
            col("b")
814
                .in_list(vec![lit(1), lit(2), lit(3)], false)
815
                .or(col("b").eq(lit(2))),
816
            vec![],
817
        );
818
        // b IN (1, 2, 3) OR b != 3
819
        test_analyze(
820
            col("b")
821
                .in_list(vec![lit(1), lit(2), lit(3)], false)
822
                .or(col("b").not_eq(lit(3))),
823
            vec![],
824
        );
825
    }
826
827
    /// Tests that analyzing expr results in the expected guarantees
828
    fn test_analyze(expr: Expr, expected: Vec<LiteralGuarantee>) {
829
        println!("Begin analyze of {expr}");
830
        let schema = schema();
831
        let physical_expr = logical2physical(&expr, &schema);
832
833
        let actual = LiteralGuarantee::analyze(&physical_expr);
834
        assert_eq!(
835
            expected, actual,
836
            "expr: {expr}\
837
               \n\nexpected: {expected:#?}\
838
               \n\nactual: {actual:#?}\
839
               \n\nexpr: {expr:#?}\
840
               \n\nphysical_expr: {physical_expr:#?}"
841
        );
842
    }
843
844
    /// Guarantee that the expression is true if the column is one of the specified values
845
    fn in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
846
    where
847
        I: IntoIterator<Item = S>,
848
        S: Into<ScalarValue> + 'a,
849
    {
850
        let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
851
        LiteralGuarantee::new(column, Guarantee::In, literals.iter())
852
    }
853
854
    /// Guarantee that the expression is true if the column is NOT any of the specified values
855
    fn not_in_guarantee<'a, I, S>(column: &str, literals: I) -> LiteralGuarantee
856
    where
857
        I: IntoIterator<Item = S>,
858
        S: Into<ScalarValue> + 'a,
859
    {
860
        let literals: Vec<_> = literals.into_iter().map(|s| s.into()).collect();
861
        LiteralGuarantee::new(column, Guarantee::NotIn, literals.iter())
862
    }
863
864
    // Schema for testing
865
    fn schema() -> SchemaRef {
866
        Arc::clone(SCHEMA.get_or_init(|| {
867
            Arc::new(Schema::new(vec![
868
                Field::new("a", DataType::Utf8, false),
869
                Field::new("b", DataType::Int32, false),
870
            ]))
871
        }))
872
    }
873
874
    static SCHEMA: OnceLock<SchemaRef> = OnceLock::new();
875
}