Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/filter.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use std::any::Any;
19
use std::pin::Pin;
20
use std::sync::Arc;
21
use std::task::{ready, Context, Poll};
22
23
use super::{
24
    ColumnStatistics, DisplayAs, ExecutionPlanProperties, PlanProperties,
25
    RecordBatchStream, SendableRecordBatchStream, Statistics,
26
};
27
use crate::common::can_project;
28
use crate::{
29
    metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet},
30
    DisplayFormatType, ExecutionPlan,
31
};
32
33
use arrow::compute::filter_record_batch;
34
use arrow::datatypes::{DataType, SchemaRef};
35
use arrow::record_batch::RecordBatch;
36
use datafusion_common::cast::as_boolean_array;
37
use datafusion_common::stats::Precision;
38
use datafusion_common::{
39
    internal_err, plan_err, project_schema, DataFusionError, Result,
40
};
41
use datafusion_execution::TaskContext;
42
use datafusion_expr::Operator;
43
use datafusion_physical_expr::equivalence::ProjectionMapping;
44
use datafusion_physical_expr::expressions::BinaryExpr;
45
use datafusion_physical_expr::intervals::utils::check_support;
46
use datafusion_physical_expr::utils::collect_columns;
47
use datafusion_physical_expr::{
48
    analyze, split_conjunction, AnalysisContext, ConstExpr, ExprBoundaries, PhysicalExpr,
49
};
50
51
use futures::stream::{Stream, StreamExt};
52
use log::trace;
53
54
/// FilterExec evaluates a boolean predicate against all input batches to determine which rows to
55
/// include in its output batches.
56
#[derive(Debug)]
57
pub struct FilterExec {
58
    /// The expression to filter on. This expression must evaluate to a boolean value.
59
    predicate: Arc<dyn PhysicalExpr>,
60
    /// The input plan
61
    input: Arc<dyn ExecutionPlan>,
62
    /// Execution metrics
63
    metrics: ExecutionPlanMetricsSet,
64
    /// Selectivity for statistics. 0 = no rows, 100 = all rows
65
    default_selectivity: u8,
66
    /// Cached plan properties: equivalence properties, partitioning, etc.
67
    cache: PlanProperties,
68
    /// The projection indices of the columns in the input schema
69
    projection: Option<Vec<usize>>,
70
}
71
72
impl FilterExec {
73
    /// Create a FilterExec on an input
74
16
    pub fn try_new(
75
16
        predicate: Arc<dyn PhysicalExpr>,
76
16
        input: Arc<dyn ExecutionPlan>,
77
16
    ) -> Result<Self> {
78
16
        match predicate.data_type(input.schema().as_ref())
?0
{
79
            DataType::Boolean => {
80
16
                let default_selectivity = 20;
81
16
                let cache = Self::compute_properties(
82
16
                    &input,
83
16
                    &predicate,
84
16
                    default_selectivity,
85
16
                    None,
86
16
                )
?0
;
87
16
                Ok(Self {
88
16
                    predicate,
89
16
                    input: Arc::clone(&input),
90
16
                    metrics: ExecutionPlanMetricsSet::new(),
91
16
                    default_selectivity,
92
16
                    cache,
93
16
                    projection: None,
94
16
                })
95
            }
96
0
            other => {
97
0
                plan_err!("Filter predicate must return BOOLEAN values, got {other:?}")
98
            }
99
        }
100
16
    }
101
102
2
    pub fn with_default_selectivity(
103
2
        mut self,
104
2
        default_selectivity: u8,
105
2
    ) -> Result<Self, DataFusionError> {
106
2
        if default_selectivity > 100 {
107
1
            return plan_err!(
108
1
                "Default filter selectivity value needs to be less than or equal to 100"
109
1
            );
110
1
        }
111
1
        self.default_selectivity = default_selectivity;
112
1
        Ok(self)
113
2
    }
114
115
    /// Return new instance of [FilterExec] with the given projection.
116
0
    pub fn with_projection(&self, projection: Option<Vec<usize>>) -> Result<Self> {
117
0
        //  check if the projection is valid
118
0
        can_project(&self.schema(), projection.as_ref())?;
119
120
0
        let projection = match projection {
121
0
            Some(projection) => match &self.projection {
122
0
                Some(p) => Some(projection.iter().map(|i| p[*i]).collect()),
123
0
                None => Some(projection),
124
            },
125
0
            None => None,
126
        };
127
128
0
        let cache = Self::compute_properties(
129
0
            &self.input,
130
0
            &self.predicate,
131
0
            self.default_selectivity,
132
0
            projection.as_ref(),
133
0
        )?;
134
0
        Ok(Self {
135
0
            predicate: Arc::clone(&self.predicate),
136
0
            input: Arc::clone(&self.input),
137
0
            metrics: self.metrics.clone(),
138
0
            default_selectivity: self.default_selectivity,
139
0
            cache,
140
0
            projection,
141
0
        })
142
0
    }
143
144
    /// The expression to filter on. This expression must evaluate to a boolean value.
145
20
    pub fn predicate(&self) -> &Arc<dyn PhysicalExpr> {
146
20
        &self.predicate
147
20
    }
148
149
    /// The input plan
150
0
    pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
151
0
        &self.input
152
0
    }
153
154
    /// The default selectivity
155
0
    pub fn default_selectivity(&self) -> u8 {
156
0
        self.default_selectivity
157
0
    }
158
159
    /// The projection indices applied to the filter output, if any
160
0
    pub fn projection(&self) -> Option<&Vec<usize>> {
161
0
        self.projection.as_ref()
162
0
    }
163
164
    /// Calculates `Statistics` for `FilterExec`, by applying selectivity (either default, or estimated) to input statistics.
165
36
    fn statistics_helper(
166
36
        input: &Arc<dyn ExecutionPlan>,
167
36
        predicate: &Arc<dyn PhysicalExpr>,
168
36
        default_selectivity: u8,
169
36
    ) -> Result<Statistics> {
170
36
        let input_stats = input.statistics()
?0
;
171
36
        let schema = input.schema();
172
36
        if !check_support(predicate, &schema) {
173
3
            let selectivity = default_selectivity as f64 / 100.0;
174
3
            let mut stats = input_stats.to_inexact();
175
3
            stats.num_rows = stats.num_rows.with_estimated_selectivity(selectivity);
176
3
            stats.total_byte_size = stats
177
3
                .total_byte_size
178
3
                .with_estimated_selectivity(selectivity);
179
3
            return Ok(stats);
180
33
        }
181
33
182
33
        let num_rows = input_stats.num_rows;
183
33
        let total_byte_size = input_stats.total_byte_size;
184
33
        let input_analysis_ctx = AnalysisContext::try_from_statistics(
185
33
            &input.schema(),
186
33
            &input_stats.column_statistics,
187
33
        )
?0
;
188
189
33
        let analysis_ctx = analyze(predicate, input_analysis_ctx, &schema)
?0
;
190
191
        // Estimate (inexact) selectivity of predicate
192
33
        let selectivity = analysis_ctx.selectivity.unwrap_or(1.0);
193
33
        let num_rows = num_rows.with_estimated_selectivity(selectivity);
194
33
        let total_byte_size = total_byte_size.with_estimated_selectivity(selectivity);
195
33
196
33
        let column_statistics = collect_new_statistics(
197
33
            &input_stats.column_statistics,
198
33
            analysis_ctx.boundaries,
199
33
        );
200
33
        Ok(Statistics {
201
33
            num_rows,
202
33
            total_byte_size,
203
33
            column_statistics,
204
33
        })
205
36
    }
206
207
16
    fn extend_constants(
208
16
        input: &Arc<dyn ExecutionPlan>,
209
16
        predicate: &Arc<dyn PhysicalExpr>,
210
16
    ) -> Vec<ConstExpr> {
211
16
        let mut res_constants = Vec::new();
212
16
        let input_eqs = input.equivalence_properties();
213
16
214
16
        let conjunctions = split_conjunction(predicate);
215
39
        for 
conjunction23
in conjunctions {
216
23
            if let Some(binary) = conjunction.as_any().downcast_ref::<BinaryExpr>() {
217
23
                if binary.op() == &Operator::Eq {
218
                    // Filter evaluates to single value for all partitions
219
4
                    if input_eqs.is_expr_constant(binary.left()) {
220
0
                        res_constants.push(
221
0
                            ConstExpr::from(binary.right()).with_across_partitions(true),
222
0
                        )
223
4
                    } else if input_eqs.is_expr_constant(binary.right()) {
224
4
                        res_constants.push(
225
4
                            ConstExpr::from(binary.left()).with_across_partitions(true),
226
4
                        )
227
0
                    }
228
19
                }
229
0
            }
230
        }
231
16
        res_constants
232
16
    }
233
    /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
234
16
    fn compute_properties(
235
16
        input: &Arc<dyn ExecutionPlan>,
236
16
        predicate: &Arc<dyn PhysicalExpr>,
237
16
        default_selectivity: u8,
238
16
        projection: Option<&Vec<usize>>,
239
16
    ) -> Result<PlanProperties> {
240
        // Combine the equal predicates with the input equivalence properties
241
        // to construct the equivalence properties:
242
16
        let stats = Self::statistics_helper(input, predicate, default_selectivity)
?0
;
243
16
        let mut eq_properties = input.equivalence_properties().clone();
244
16
        let (equal_pairs, _) = collect_columns_from_predicate(predicate);
245
20
        for (
lhs, rhs4
) in equal_pairs {
246
4
            eq_properties.add_equal_conditions(lhs, rhs)
?0
247
        }
248
        // Add the columns that have only one viable value (singleton) after
249
        // filtering to constants.
250
16
        let constants = collect_columns(predicate)
251
16
            .into_iter()
252
20
            .filter(|column| stats.column_statistics[column.index()].is_singleton())
253
16
            .map(|column| {
254
3
                let expr = Arc::new(column) as _;
255
3
                ConstExpr::new(expr).with_across_partitions(true)
256
16
            });
257
16
        // this is for statistics
258
16
        eq_properties = eq_properties.with_constants(constants);
259
16
        // This is for logical constants (for example: given a = '1', a can be marked as a constant)
260
16
        // TODO: handle other ways of expressing equality (for example c1 BETWEEN 0 AND 0)
261
16
        eq_properties =
262
16
            eq_properties.with_constants(Self::extend_constants(input, predicate));
263
16
264
16
        let mut output_partitioning = input.output_partitioning().clone();
265
        // If contains projection, update the PlanProperties.
266
16
        if let Some(
projection0
) = projection {
267
0
            let schema = eq_properties.schema();
268
0
            let projection_mapping = ProjectionMapping::from_indices(projection, schema)?;
269
0
            let out_schema = project_schema(schema, Some(projection))?;
270
0
            output_partitioning =
271
0
                output_partitioning.project(&projection_mapping, &eq_properties);
272
0
            eq_properties = eq_properties.project(&projection_mapping, out_schema);
273
16
        }
274
16
        Ok(PlanProperties::new(
275
16
            eq_properties,
276
16
            output_partitioning,
277
16
            input.execution_mode(),
278
16
        ))
279
16
    }
280
}
281
282
impl DisplayAs for FilterExec {
283
0
    fn fmt_as(
284
0
        &self,
285
0
        t: DisplayFormatType,
286
0
        f: &mut std::fmt::Formatter,
287
0
    ) -> std::fmt::Result {
288
0
        match t {
289
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
290
0
                let display_projections = if let Some(projection) =
291
0
                    self.projection.as_ref()
292
                {
293
0
                    format!(
294
0
                        ", projection=[{}]",
295
0
                        projection
296
0
                            .iter()
297
0
                            .map(|index| format!(
298
0
                                "{}@{}",
299
0
                                self.input.schema().fields().get(*index).unwrap().name(),
300
0
                                index
301
0
                            ))
302
0
                            .collect::<Vec<_>>()
303
0
                            .join(", ")
304
0
                    )
305
                } else {
306
0
                    "".to_string()
307
                };
308
0
                write!(f, "FilterExec: {}{}", self.predicate, display_projections)
309
0
            }
310
0
        }
311
0
    }
312
}
313
314
impl ExecutionPlan for FilterExec {
315
0
    fn name(&self) -> &'static str {
316
0
        "FilterExec"
317
0
    }
318
319
    /// Return a reference to Any that can be used for downcasting
320
0
    fn as_any(&self) -> &dyn Any {
321
0
        self
322
0
    }
323
324
29
    fn properties(&self) -> &PlanProperties {
325
29
        &self.cache
326
29
    }
327
328
0
    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
329
0
        vec![&self.input]
330
0
    }
331
332
0
    fn maintains_input_order(&self) -> Vec<bool> {
333
0
        // tell optimizer this operator doesn't reorder its input
334
0
        vec![true]
335
0
    }
336
337
0
    fn with_new_children(
338
0
        self: Arc<Self>,
339
0
        mut children: Vec<Arc<dyn ExecutionPlan>>,
340
0
    ) -> Result<Arc<dyn ExecutionPlan>> {
341
0
        FilterExec::try_new(Arc::clone(&self.predicate), children.swap_remove(0))
342
0
            .and_then(|e| {
343
0
                let selectivity = e.default_selectivity();
344
0
                e.with_default_selectivity(selectivity)
345
0
            })
346
0
            .and_then(|e| e.with_projection(self.projection().cloned()))
347
0
            .map(|e| Arc::new(e) as _)
348
0
    }
349
350
0
    fn execute(
351
0
        &self,
352
0
        partition: usize,
353
0
        context: Arc<TaskContext>,
354
0
    ) -> Result<SendableRecordBatchStream> {
355
0
        trace!("Start FilterExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
356
0
        let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
357
0
        Ok(Box::pin(FilterExecStream {
358
0
            schema: self.schema(),
359
0
            predicate: Arc::clone(&self.predicate),
360
0
            input: self.input.execute(partition, context)?,
361
0
            baseline_metrics,
362
0
            projection: self.projection.clone(),
363
        }))
364
0
    }
365
366
0
    fn metrics(&self) -> Option<MetricsSet> {
367
0
        Some(self.metrics.clone_inner())
368
0
    }
369
370
    /// The output statistics of a filtering operation can be estimated if the
371
    /// predicate's selectivity value can be determined for the incoming data.
372
20
    fn statistics(&self) -> Result<Statistics> {
373
20
        Self::statistics_helper(&self.input, self.predicate(), self.default_selectivity)
374
20
    }
375
}
376
377
/// Converts the analyzed `ExprBoundaries` of each column into
378
/// `ColumnStatistics`. When the lower and upper bounds of a column's interval
379
/// are equal, min/max are reported as exact; otherwise they are reported as
380
/// inexact. Null counts and distinct counts are always demoted to inexact.
381
33
fn collect_new_statistics(
382
33
    input_column_stats: &[ColumnStatistics],
383
33
    analysis_boundaries: Vec<ExprBoundaries>,
384
33
) -> Vec<ColumnStatistics> {
385
33
    analysis_boundaries
386
33
        .into_iter()
387
33
        .enumerate()
388
33
        .map(
389
33
            |(
390
                idx,
391
                ExprBoundaries {
392
                    interval,
393
                    distinct_count,
394
                    ..
395
                },
396
54
            )| {
397
54
                let (lower, upper) = interval.into_bounds();
398
54
                let (min_value, max_value) = if lower.eq(&upper) {
399
7
                    (Precision::Exact(lower), Precision::Exact(upper))
400
                } else {
401
47
                    (Precision::Inexact(lower), Precision::Inexact(upper))
402
                };
403
54
                ColumnStatistics {
404
54
                    null_count: input_column_stats[idx].null_count.to_inexact(),
405
54
                    max_value,
406
54
                    min_value,
407
54
                    distinct_count: distinct_count.to_inexact(),
408
54
                }
409
54
            },
410
33
        )
411
33
        .collect()
412
33
}
413
414
/// The FilterExec stream wraps the input stream and applies the predicate expression to
415
/// determine which rows to include in its output batches
416
struct FilterExecStream {
417
    /// Output schema after the projection
418
    schema: SchemaRef,
419
    /// The expression to filter on. This expression must evaluate to a boolean value.
420
    predicate: Arc<dyn PhysicalExpr>,
421
    /// The input partition to filter.
422
    input: SendableRecordBatchStream,
423
    /// runtime metrics recording
424
    baseline_metrics: BaselineMetrics,
425
    /// The projection indices of the columns in the input schema
426
    projection: Option<Vec<usize>>,
427
}
428
429
0
pub fn batch_filter(
430
0
    batch: &RecordBatch,
431
0
    predicate: &Arc<dyn PhysicalExpr>,
432
0
) -> Result<RecordBatch> {
433
0
    filter_and_project(batch, predicate, None, &batch.schema())
434
0
}
435
436
0
fn filter_and_project(
437
0
    batch: &RecordBatch,
438
0
    predicate: &Arc<dyn PhysicalExpr>,
439
0
    projection: Option<&Vec<usize>>,
440
0
    output_schema: &SchemaRef,
441
0
) -> Result<RecordBatch> {
442
0
    predicate
443
0
        .evaluate(batch)
444
0
        .and_then(|v| v.into_array(batch.num_rows()))
445
0
        .and_then(|array| {
446
0
            Ok(match (as_boolean_array(&array), projection) {
447
                // apply filter array to record batch
448
0
                (Ok(filter_array), None) => filter_record_batch(batch, filter_array)?,
449
0
                (Ok(filter_array), Some(projection)) => {
450
0
                    let projected_columns = projection
451
0
                        .iter()
452
0
                        .map(|i| Arc::clone(batch.column(*i)))
453
0
                        .collect();
454
0
                    let projected_batch = RecordBatch::try_new(
455
0
                        Arc::clone(output_schema),
456
0
                        projected_columns,
457
0
                    )?;
458
0
                    filter_record_batch(&projected_batch, filter_array)?
459
                }
460
                (Err(_), _) => {
461
0
                    return internal_err!(
462
0
                        "Cannot create filter_array from non-boolean predicates"
463
0
                    );
464
                }
465
            })
466
0
        })
467
0
}
468
469
impl Stream for FilterExecStream {
470
    type Item = Result<RecordBatch>;
471
472
0
    fn poll_next(
473
0
        mut self: Pin<&mut Self>,
474
0
        cx: &mut Context<'_>,
475
0
    ) -> Poll<Option<Self::Item>> {
476
        let poll;
477
        loop {
478
0
            match ready!(self.input.poll_next_unpin(cx)) {
479
0
                Some(Ok(batch)) => {
480
0
                    let timer = self.baseline_metrics.elapsed_compute().timer();
481
0
                    let filtered_batch = filter_and_project(
482
0
                        &batch,
483
0
                        &self.predicate,
484
0
                        self.projection.as_ref(),
485
0
                        &self.schema,
486
0
                    )?;
487
0
                    timer.done();
488
0
                    // skip entirely filtered batches
489
0
                    if filtered_batch.num_rows() == 0 {
490
0
                        continue;
491
0
                    }
492
0
                    poll = Poll::Ready(Some(Ok(filtered_batch)));
493
0
                    break;
494
                }
495
0
                value => {
496
0
                    poll = Poll::Ready(value);
497
0
                    break;
498
                }
499
            }
500
        }
501
0
        self.baseline_metrics.record_poll(poll)
502
0
    }
503
504
0
    fn size_hint(&self) -> (usize, Option<usize>) {
505
0
        // same number of record batches
506
0
        self.input.size_hint()
507
0
    }
508
}
509
510
impl RecordBatchStream for FilterExecStream {
511
0
    fn schema(&self) -> SchemaRef {
512
0
        Arc::clone(&self.schema)
513
0
    }
514
}
515
516
/// Return the equals Column-Pairs and Non-equals Column-Pairs
517
17
fn collect_columns_from_predicate(predicate: &Arc<dyn PhysicalExpr>) -> EqualAndNonEqual {
518
17
    let mut eq_predicate_columns = Vec::<PhysicalExprPairRef>::new();
519
17
    let mut ne_predicate_columns = Vec::<PhysicalExprPairRef>::new();
520
17
521
17
    let predicates = split_conjunction(predicate);
522
27
    predicates.into_iter().for_each(|p| {
523
27
        if let Some(binary) = p.as_any().downcast_ref::<BinaryExpr>() {
524
27
            match binary.op() {
525
                Operator::Eq => {
526
6
                    eq_predicate_columns.push((binary.left(), binary.right()))
527
                }
528
                Operator::NotEq => {
529
1
                    ne_predicate_columns.push((binary.left(), binary.right()))
530
                }
531
20
                _ => {}
532
            }
533
0
        }
534
27
    });
535
17
536
17
    (eq_predicate_columns, ne_predicate_columns)
537
17
}
538
539
/// Pair of `Arc<dyn PhysicalExpr>`s
540
pub type PhysicalExprPairRef<'a> = (&'a Arc<dyn PhysicalExpr>, &'a Arc<dyn PhysicalExpr>);
541
542
/// The equals Column-Pairs and Non-equals Column-Pairs in the Predicates
543
pub type EqualAndNonEqual<'a> =
544
    (Vec<PhysicalExprPairRef<'a>>, Vec<PhysicalExprPairRef<'a>>);
545
546
#[cfg(test)]
547
mod tests {
548
    use super::*;
549
    use crate::empty::EmptyExec;
550
    use crate::expressions::*;
551
    use crate::test;
552
    use crate::test::exec::StatisticsExec;
553
554
    use arrow::datatypes::{Field, Schema};
555
    use arrow_schema::{UnionFields, UnionMode};
556
    use datafusion_common::ScalarValue;
557
558
    #[tokio::test]
559
1
    async fn collect_columns_predicates() -> Result<()> {
560
1
        let schema = test::aggr_test_schema();
561
1
        let predicate: Arc<dyn PhysicalExpr> = binary(
562
1
            binary(
563
1
                binary(col("c2", &schema)
?0
, Operator::GtEq, lit(1u32), &schema)
?0
,
564
1
                Operator::And,
565
1
                binary(col("c2", &schema)
?0
, Operator::Eq, lit(4u32), &schema)
?0
,
566
1
                &schema,
567
1
            )
?0
,
568
1
            Operator::And,
569
1
            binary(
570
1
                binary(
571
1
                    col("c2", &schema)
?0
,
572
1
                    Operator::Eq,
573
1
                    col("c9", &schema)
?0
,
574
1
                    &schema,
575
1
                )
?0
,
576
1
                Operator::And,
577
1
                binary(
578
1
                    col("c1", &schema)
?0
,
579
1
                    Operator::NotEq,
580
1
                    col("c13", &schema)
?0
,
581
1
                    &schema,
582
1
                )
?0
,
583
1
                &schema,
584
1
            )
?0
,
585
1
            &schema,
586
1
        )
?0
;
587
1
588
1
        let (equal_pairs, ne_pairs) = collect_columns_from_predicate(&predicate);
589
1
        assert_eq!(2, equal_pairs.len());
590
1
        assert!(equal_pairs[0].0.eq(&col("c2", &schema)
?0
));
591
1
        assert!(equal_pairs[0].1.eq(&lit(4u32)));
592
1
593
1
        assert!(equal_pairs[1].0.eq(&col("c2", &schema)
?0
));
594
1
        assert!(equal_pairs[1].1.eq(&col("c9", &schema)
?0
));
595
1
596
1
        assert_eq!(1, ne_pairs.len());
597
1
        assert!(ne_pairs[0].0.eq(&col("c1", &schema)
?0
));
598
1
        assert!(ne_pairs[0].1.eq(&col("c13", &schema)
?0
));
599
1
600
1
        Ok(())
601
1
    }
602
603
    #[tokio::test]
604
1
    async fn test_filter_statistics_basic_expr() -> Result<()> {
605
1
        // Table:
606
1
        //      a: min=1, max=100
607
1
        let bytes_per_row = 4;
608
1
        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
609
1
        let input = Arc::new(StatisticsExec::new(
610
1
            Statistics {
611
1
                num_rows: Precision::Inexact(100),
612
1
                total_byte_size: Precision::Inexact(100 * bytes_per_row),
613
1
                column_statistics: vec![ColumnStatistics {
614
1
                    min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
615
1
                    max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
616
1
                    ..Default::default()
617
1
                }],
618
1
            },
619
1
            schema.clone(),
620
1
        ));
621
1
622
1
        // a <= 25
623
1
        let predicate: Arc<dyn PhysicalExpr> =
624
1
            binary(col("a", &schema)
?0
, Operator::LtEq, lit(25i32), &schema)
?0
;
625
1
626
1
        // WHERE a <= 25
627
1
        let filter: Arc<dyn ExecutionPlan> =
628
1
            Arc::new(FilterExec::try_new(predicate, input)
?0
);
629
1
630
1
        let statistics = filter.statistics()
?0
;
631
1
        assert_eq!(statistics.num_rows, Precision::Inexact(25));
632
1
        assert_eq!(
633
1
            statistics.total_byte_size,
634
1
            Precision::Inexact(25 * bytes_per_row)
635
1
        );
636
1
        assert_eq!(
637
1
            statistics.column_statistics,
638
1
            vec![ColumnStatistics {
639
1
                min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
640
1
                max_value: Precision::Inexact(ScalarValue::Int32(Some(25))),
641
1
                ..Default::default()
642
1
            }]
643
1
        );
644
1
645
1
        Ok(())
646
1
    }
647
648
    #[tokio::test]
649
1
    async fn test_filter_statistics_column_level_nested() -> Result<()> {
650
1
        // Table:
651
1
        //      a: min=1, max=100
652
1
        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
653
1
        let input = Arc::new(StatisticsExec::new(
654
1
            Statistics {
655
1
                num_rows: Precision::Inexact(100),
656
1
                column_statistics: vec![ColumnStatistics {
657
1
                    min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
658
1
                    max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
659
1
                    ..Default::default()
660
1
                }],
661
1
                total_byte_size: Precision::Absent,
662
1
            },
663
1
            schema.clone(),
664
1
        ));
665
1
666
1
        // WHERE a <= 25
667
1
        let sub_filter: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new(
668
1
            binary(col("a", &schema)
?0
, Operator::LtEq, lit(25i32), &schema)
?0
,
669
1
            input,
670
1
        )
?0
);
671
1
672
1
        // Nested filters (two separate physical plans, instead of AND chain in the expr)
673
1
        // WHERE a >= 10
674
1
        // WHERE a <= 25
675
1
        let filter: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new(
676
1
            binary(col("a", &schema)
?0
, Operator::GtEq, lit(10i32), &schema)
?0
,
677
1
            sub_filter,
678
1
        )
?0
);
679
1
680
1
        let statistics = filter.statistics()
?0
;
681
1
        assert_eq!(statistics.num_rows, Precision::Inexact(16));
682
1
        assert_eq!(
683
1
            statistics.column_statistics,
684
1
            vec![ColumnStatistics {
685
1
                min_value: Precision::Inexact(ScalarValue::Int32(Some(10))),
686
1
                max_value: Precision::Inexact(ScalarValue::Int32(Some(25))),
687
1
                ..Default::default()
688
1
            }]
689
1
        );
690
1
691
1
        Ok(())
692
1
    }
693
694
    #[tokio::test]
695
1
    async fn test_filter_statistics_column_level_nested_multiple() -> Result<()> {
696
1
        // Table:
697
1
        //      a: min=1, max=100
698
1
        //      b: min=1, max=50
699
1
        let schema = Schema::new(vec![
700
1
            Field::new("a", DataType::Int32, false),
701
1
            Field::new("b", DataType::Int32, false),
702
1
        ]);
703
1
        let input = Arc::new(StatisticsExec::new(
704
1
            Statistics {
705
1
                num_rows: Precision::Inexact(100),
706
1
                column_statistics: vec![
707
1
                    ColumnStatistics {
708
1
                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
709
1
                        max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
710
1
                        ..Default::default()
711
1
                    },
712
1
                    ColumnStatistics {
713
1
                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
714
1
                        max_value: Precision::Inexact(ScalarValue::Int32(Some(50))),
715
1
                        ..Default::default()
716
1
                    },
717
1
                ],
718
1
                total_byte_size: Precision::Absent,
719
1
            },
720
1
            schema.clone(),
721
1
        ));
722
1
723
1
        // WHERE a <= 25
724
1
        let a_lte_25: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new(
725
1
            binary(col("a", &schema)
?0
, Operator::LtEq, lit(25i32), &schema)
?0
,
726
1
            input,
727
1
        )
?0
);
728
1
729
1
        // WHERE b > 45
730
1
        let b_gt_5: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new(
731
1
            binary(col("b", &schema)
?0
, Operator::Gt, lit(45i32), &schema)
?0
,
732
1
            a_lte_25,
733
1
        )
?0
);
734
1
735
1
        // WHERE a >= 10
736
1
        let filter: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new(
737
1
            binary(col("a", &schema)
?0
, Operator::GtEq, lit(10i32), &schema)
?0
,
738
1
            b_gt_5,
739
1
        )
?0
);
740
1
        let statistics = filter.statistics()
?0
;
741
1
        // On a uniform distribution, only sixteen rows will satisfy the
742
1
        // filter that 'a' proposed (a >= 10 AND a <= 25) (16/100) and only
743
1
        // 5 rows will satisfy the filter that 'b' proposed (b > 45) (5/50).
744
1
        //
745
1
        // Which would result in a selectivity of '16/100 * 5/50' or 0.016
746
1
        // and that means about 1.6% of all the rows (rounded up to 2 rows).
747
1
        assert_eq!(statistics.num_rows, Precision::Inexact(2));
748
1
        assert_eq!(
749
1
            statistics.column_statistics,
750
1
            vec![
751
1
                ColumnStatistics {
752
1
                    min_value: Precision::Inexact(ScalarValue::Int32(Some(10))),
753
1
                    max_value: Precision::Inexact(ScalarValue::Int32(Some(25))),
754
1
                    ..Default::default()
755
1
                },
756
1
                ColumnStatistics {
757
1
                    min_value: Precision::Inexact(ScalarValue::Int32(Some(46))),
758
1
                    max_value: Precision::Inexact(ScalarValue::Int32(Some(50))),
759
1
                    ..Default::default()
760
1
                }
761
1
            ]
762
1
        );
763
1
764
1
        Ok(())
765
1
    }
766
767
    #[tokio::test]
768
1
    async fn test_filter_statistics_when_input_stats_missing() -> Result<()> {
769
1
        // Table:
770
1
        //      a: min=???, max=??? (missing)
771
1
        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
772
1
        let input = Arc::new(StatisticsExec::new(
773
1
            Statistics::new_unknown(&schema),
774
1
            schema.clone(),
775
1
        ));
776
1
777
1
        // a <= 25
778
1
        let predicate: Arc<dyn PhysicalExpr> =
779
1
            binary(col("a", &schema)
?0
, Operator::LtEq, lit(25i32), &schema)
?0
;
780
1
781
1
        // WHERE a <= 25
782
1
        let filter: Arc<dyn ExecutionPlan> =
783
1
            Arc::new(FilterExec::try_new(predicate, input)
?0
);
784
1
785
1
        let statistics = filter.statistics()
?0
;
786
1
        assert_eq!(statistics.num_rows, Precision::Absent);
787
1
788
1
        Ok(())
789
1
    }
790
791
    #[tokio::test]
792
1
    async fn test_filter_statistics_multiple_columns() -> Result<()> {
793
1
        // Table:
794
1
        //      a: min=1, max=100
795
1
        //      b: min=1, max=3
796
1
        //      c: min=1000.0  max=1100.0
797
1
        let schema = Schema::new(vec![
798
1
            Field::new("a", DataType::Int32, false),
799
1
            Field::new("b", DataType::Int32, false),
800
1
            Field::new("c", DataType::Float32, false),
801
1
        ]);
802
1
        let input = Arc::new(StatisticsExec::new(
803
1
            Statistics {
804
1
                num_rows: Precision::Inexact(1000),
805
1
                total_byte_size: Precision::Inexact(4000),
806
1
                column_statistics: vec![
807
1
                    ColumnStatistics {
808
1
                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
809
1
                        max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
810
1
                        ..Default::default()
811
1
                    },
812
1
                    ColumnStatistics {
813
1
                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
814
1
                        max_value: Precision::Inexact(ScalarValue::Int32(Some(3))),
815
1
                        ..Default::default()
816
1
                    },
817
1
                    ColumnStatistics {
818
1
                        min_value: Precision::Inexact(ScalarValue::Float32(Some(1000.0))),
819
1
                        max_value: Precision::Inexact(ScalarValue::Float32(Some(1100.0))),
820
1
                        ..Default::default()
821
1
                    },
822
1
                ],
823
1
            },
824
1
            schema,
825
1
        ));
826
1
        // WHERE a<=53 AND (b=3 AND (c<=1075.0 AND a>b))
827
1
        let predicate = Arc::new(BinaryExpr::new(
828
1
            Arc::new(BinaryExpr::new(
829
1
                Arc::new(Column::new("a", 0)),
830
1
                Operator::LtEq,
831
1
                Arc::new(Literal::new(ScalarValue::Int32(Some(53)))),
832
1
            )),
833
1
            Operator::And,
834
1
            Arc::new(BinaryExpr::new(
835
1
                Arc::new(BinaryExpr::new(
836
1
                    Arc::new(Column::new("b", 1)),
837
1
                    Operator::Eq,
838
1
                    Arc::new(Literal::new(ScalarValue::Int32(Some(3)))),
839
1
                )),
840
1
                Operator::And,
841
1
                Arc::new(BinaryExpr::new(
842
1
                    Arc::new(BinaryExpr::new(
843
1
                        Arc::new(Column::new("c", 2)),
844
1
                        Operator::LtEq,
845
1
                        Arc::new(Literal::new(ScalarValue::Float32(Some(1075.0)))),
846
1
                    )),
847
1
                    Operator::And,
848
1
                    Arc::new(BinaryExpr::new(
849
1
                        Arc::new(Column::new("a", 0)),
850
1
                        Operator::Gt,
851
1
                        Arc::new(Column::new("b", 1)),
852
1
                    )),
853
1
                )),
854
1
            )),
855
1
        ));
856
1
        let filter: Arc<dyn ExecutionPlan> =
857
1
            Arc::new(FilterExec::try_new(predicate, input)
?0
);
858
1
        let statistics = filter.statistics()
?0
;
859
1
        // 0.5 (from a) * 0.333333... (from b) * 0.798387... (from c) ≈ 0.1330...
860
1
        // num_rows after ceil => 133.0... => 134
861
1
        // total_byte_size after ceil => 532.0... => 533
862
1
        assert_eq!(statistics.num_rows, Precision::Inexact(134));
863
1
        assert_eq!(statistics.total_byte_size, Precision::Inexact(533));
864
1
        let exp_col_stats = vec![
865
1
            ColumnStatistics {
866
1
                min_value: Precision::Inexact(ScalarValue::Int32(Some(4))),
867
1
                max_value: Precision::Inexact(ScalarValue::Int32(Some(53))),
868
1
                ..Default::default()
869
1
            },
870
1
            ColumnStatistics {
871
1
                min_value: Precision::Inexact(ScalarValue::Int32(Some(3))),
872
1
                max_value: Precision::Inexact(ScalarValue::Int32(Some(3))),
873
1
                ..Default::default()
874
1
            },
875
1
            ColumnStatistics {
876
1
                min_value: Precision::Inexact(ScalarValue::Float32(Some(1000.0))),
877
1
                max_value: Precision::Inexact(ScalarValue::Float32(Some(1075.0))),
878
1
                ..Default::default()
879
1
            },
880
1
        ];
881
1
        let _ = exp_col_stats
882
1
            .into_iter()
883
1
            .zip(statistics.column_statistics)
884
1
            .map(|(expected, actual)| 
{0
885
1
                if let Some(
val0
) =
actual.min_value.get_value()0
{
886
1
                    if 
val.data_type().is_floating()0
{
887
1
                        // Windows rounds arithmetic operation results differently for floating point numbers.
888
1
                        // Therefore, we check if the actual values are in an epsilon range.
889
1
                        let actual_min = actual.min_value.get_value().unwrap();
890
0
                        let actual_max = actual.max_value.get_value().unwrap();
891
0
                        let expected_min = expected.min_value.get_value().unwrap();
892
0
                        let expected_max = expected.max_value.get_value().unwrap();
893
0
                        let eps = ScalarValue::Float32(Some(1e-6));
894
0
895
0
                        assert!(actual_min.sub(expected_min).unwrap() < eps);
896
1
                        
assert!(actual_min.sub(expected_min).unwrap() < eps)0
;
897
1
898
1
                        
assert!(actual_max.sub(expected_max).unwrap() < eps)0
;
899
1
                        
assert!(actual_max.sub(expected_max).unwrap() < eps)0
;
900
1
                    } else {
901
1
                        
assert_eq!(actual, expected)0
;
902
1
                    }
903
1
                } else {
904
1
                    
assert_eq!(actual, expected)0
;
905
1
                }
906
1
            
}0
);
907
1
908
1
        Ok(())
909
1
    }
910
911
    #[tokio::test]
912
1
    async fn test_filter_statistics_full_selective() -> Result<()> {
913
1
        // Table:
914
1
        //      a: min=1, max=100
915
1
        //      b: min=1, max=3
916
1
        let schema = Schema::new(vec![
917
1
            Field::new("a", DataType::Int32, false),
918
1
            Field::new("b", DataType::Int32, false),
919
1
        ]);
920
1
        let input = Arc::new(StatisticsExec::new(
921
1
            Statistics {
922
1
                num_rows: Precision::Inexact(1000),
923
1
                total_byte_size: Precision::Inexact(4000),
924
1
                column_statistics: vec![
925
1
                    ColumnStatistics {
926
1
                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
927
1
                        max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
928
1
                        ..Default::default()
929
1
                    },
930
1
                    ColumnStatistics {
931
1
                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
932
1
                        max_value: Precision::Inexact(ScalarValue::Int32(Some(3))),
933
1
                        ..Default::default()
934
1
                    },
935
1
                ],
936
1
            },
937
1
            schema,
938
1
        ));
939
1
        // WHERE a<200 AND 1<=b
940
1
        let predicate = Arc::new(BinaryExpr::new(
941
1
            Arc::new(BinaryExpr::new(
942
1
                Arc::new(Column::new("a", 0)),
943
1
                Operator::Lt,
944
1
                Arc::new(Literal::new(ScalarValue::Int32(Some(200)))),
945
1
            )),
946
1
            Operator::And,
947
1
            Arc::new(BinaryExpr::new(
948
1
                Arc::new(Literal::new(ScalarValue::Int32(Some(1)))),
949
1
                Operator::LtEq,
950
1
                Arc::new(Column::new("b", 1)),
951
1
            )),
952
1
        ));
953
1
        // Since filter predicate passes all entries, statistics after filter shouldn't change.
954
1
        let expected = input.statistics()
?0
.column_statistics;
955
1
        let filter: Arc<dyn ExecutionPlan> =
956
1
            Arc::new(FilterExec::try_new(predicate, input)
?0
);
957
1
        let statistics = filter.statistics()
?0
;
958
1
959
1
        assert_eq!(statistics.num_rows, Precision::Inexact(1000));
960
1
        assert_eq!(statistics.total_byte_size, Precision::Inexact(4000));
961
1
        assert_eq!(statistics.column_statistics, expected);
962
1
963
1
        Ok(())
964
1
    }
965
966
    #[tokio::test]
967
1
    async fn test_filter_statistics_zero_selective() -> Result<()> {
968
1
        // Table:
969
1
        //      a: min=1, max=100
970
1
        //      b: min=1, max=3
971
1
        let schema = Schema::new(vec![
972
1
            Field::new("a", DataType::Int32, false),
973
1
            Field::new("b", DataType::Int32, false),
974
1
        ]);
975
1
        let input = Arc::new(StatisticsExec::new(
976
1
            Statistics {
977
1
                num_rows: Precision::Inexact(1000),
978
1
                total_byte_size: Precision::Inexact(4000),
979
1
                column_statistics: vec![
980
1
                    ColumnStatistics {
981
1
                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
982
1
                        max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
983
1
                        ..Default::default()
984
1
                    },
985
1
                    ColumnStatistics {
986
1
                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
987
1
                        max_value: Precision::Inexact(ScalarValue::Int32(Some(3))),
988
1
                        ..Default::default()
989
1
                    },
990
1
                ],
991
1
            },
992
1
            schema,
993
1
        ));
994
1
        // WHERE a>200 AND 1<=b
995
1
        let predicate = Arc::new(BinaryExpr::new(
996
1
            Arc::new(BinaryExpr::new(
997
1
                Arc::new(Column::new("a", 0)),
998
1
                Operator::Gt,
999
1
                Arc::new(Literal::new(ScalarValue::Int32(Some(200)))),
1000
1
            )),
1001
1
            Operator::And,
1002
1
            Arc::new(BinaryExpr::new(
1003
1
                Arc::new(Literal::new(ScalarValue::Int32(Some(1)))),
1004
1
                Operator::LtEq,
1005
1
                Arc::new(Column::new("b", 1)),
1006
1
            )),
1007
1
        ));
1008
1
        let filter: Arc<dyn ExecutionPlan> =
1009
1
            Arc::new(FilterExec::try_new(predicate, input)
?0
);
1010
1
        let statistics = filter.statistics()
?0
;
1011
1
1012
1
        assert_eq!(statistics.num_rows, Precision::Inexact(0));
1013
1
        assert_eq!(statistics.total_byte_size, Precision::Inexact(0));
1014
1
        assert_eq!(
1015
1
            statistics.column_statistics,
1016
1
            vec![
1017
1
                ColumnStatistics {
1018
1
                    min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
1019
1
                    max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
1020
1
                    ..Default::default()
1021
1
                },
1022
1
                ColumnStatistics {
1023
1
                    min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
1024
1
                    max_value: Precision::Inexact(ScalarValue::Int32(Some(3))),
1025
1
                    ..Default::default()
1026
1
                },
1027
1
            ]
1028
1
        );
1029
1
1030
1
        Ok(())
1031
1
    }
1032
1033
    #[tokio::test]
1034
1
    async fn test_filter_statistics_more_inputs() -> Result<()> {
1035
1
        let schema = Schema::new(vec![
1036
1
            Field::new("a", DataType::Int32, false),
1037
1
            Field::new("b", DataType::Int32, false),
1038
1
        ]);
1039
1
        let input = Arc::new(StatisticsExec::new(
1040
1
            Statistics {
1041
1
                num_rows: Precision::Inexact(1000),
1042
1
                total_byte_size: Precision::Inexact(4000),
1043
1
                column_statistics: vec![
1044
1
                    ColumnStatistics {
1045
1
                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
1046
1
                        max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
1047
1
                        ..Default::default()
1048
1
                    },
1049
1
                    ColumnStatistics {
1050
1
                        min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
1051
1
                        max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
1052
1
                        ..Default::default()
1053
1
                    },
1054
1
                ],
1055
1
            },
1056
1
            schema,
1057
1
        ));
1058
1
        // WHERE a<50
1059
1
        let predicate = Arc::new(BinaryExpr::new(
1060
1
            Arc::new(Column::new("a", 0)),
1061
1
            Operator::Lt,
1062
1
            Arc::new(Literal::new(ScalarValue::Int32(Some(50)))),
1063
1
        ));
1064
1
        let filter: Arc<dyn ExecutionPlan> =
1065
1
            Arc::new(FilterExec::try_new(predicate, input)
?0
);
1066
1
        let statistics = filter.statistics()
?0
;
1067
1
1068
1
        assert_eq!(statistics.num_rows, Precision::Inexact(490));
1069
1
        assert_eq!(statistics.total_byte_size, Precision::Inexact(1960));
1070
1
        assert_eq!(
1071
1
            statistics.column_statistics,
1072
1
            vec![
1073
1
                ColumnStatistics {
1074
1
                    min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
1075
1
                    max_value: Precision::Inexact(ScalarValue::Int32(Some(49))),
1076
1
                    ..Default::default()
1077
1
                },
1078
1
                ColumnStatistics {
1079
1
                    min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
1080
1
                    max_value: Precision::Inexact(ScalarValue::Int32(Some(100))),
1081
1
                    ..Default::default()
1082
1
                },
1083
1
            ]
1084
1
        );
1085
1
1086
1
        Ok(())
1087
1
    }
1088
1089
    #[tokio::test]
1090
1
    async fn test_empty_input_statistics() -> Result<()> {
1091
1
        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
1092
1
        let input = Arc::new(StatisticsExec::new(
1093
1
            Statistics::new_unknown(&schema),
1094
1
            schema,
1095
1
        ));
1096
1
        // WHERE a <= 10 AND 0 <= a - 5
1097
1
        let predicate = Arc::new(BinaryExpr::new(
1098
1
            Arc::new(BinaryExpr::new(
1099
1
                Arc::new(Column::new("a", 0)),
1100
1
                Operator::LtEq,
1101
1
                Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
1102
1
            )),
1103
1
            Operator::And,
1104
1
            Arc::new(BinaryExpr::new(
1105
1
                Arc::new(Literal::new(ScalarValue::Int32(Some(0)))),
1106
1
                Operator::LtEq,
1107
1
                Arc::new(BinaryExpr::new(
1108
1
                    Arc::new(Column::new("a", 0)),
1109
1
                    Operator::Minus,
1110
1
                    Arc::new(Literal::new(ScalarValue::Int32(Some(5)))),
1111
1
                )),
1112
1
            )),
1113
1
        ));
1114
1
        let filter: Arc<dyn ExecutionPlan> =
1115
1
            Arc::new(FilterExec::try_new(predicate, input)
?0
);
1116
1
        let filter_statistics = filter.statistics()
?0
;
1117
1
1118
1
        let expected_filter_statistics = Statistics {
1119
1
            num_rows: Precision::Absent,
1120
1
            total_byte_size: Precision::Absent,
1121
1
            column_statistics: vec![ColumnStatistics {
1122
1
                null_count: Precision::Absent,
1123
1
                min_value: Precision::Inexact(ScalarValue::Int32(Some(5))),
1124
1
                max_value: Precision::Inexact(ScalarValue::Int32(Some(10))),
1125
1
                distinct_count: Precision::Absent,
1126
1
            }],
1127
1
        };
1128
1
1129
1
        assert_eq!(filter_statistics, expected_filter_statistics);
1130
1
1131
1
        Ok(())
1132
1
    }
1133
1134
    #[tokio::test]
1135
1
    async fn test_statistics_with_constant_column() -> Result<()> {
1136
1
        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
1137
1
        let input = Arc::new(StatisticsExec::new(
1138
1
            Statistics::new_unknown(&schema),
1139
1
            schema,
1140
1
        ));
1141
1
        // WHERE a = 10
1142
1
        let predicate = Arc::new(BinaryExpr::new(
1143
1
            Arc::new(Column::new("a", 0)),
1144
1
            Operator::Eq,
1145
1
            Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
1146
1
        ));
1147
1
        let filter: Arc<dyn ExecutionPlan> =
1148
1
            Arc::new(FilterExec::try_new(predicate, input)
?0
);
1149
1
        let filter_statistics = filter.statistics()
?0
;
1150
1
        // First column is "a", and it is a column with only one value after the filter.
1151
1
        assert!(filter_statistics.column_statistics[0].is_singleton());
1152
1
1153
1
        Ok(())
1154
1
    }
1155
1156
    #[tokio::test]
1157
1
    async fn test_validation_filter_selectivity() -> Result<()> {
1158
1
        let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
1159
1
        let input = Arc::new(StatisticsExec::new(
1160
1
            Statistics::new_unknown(&schema),
1161
1
            schema,
1162
1
        ));
1163
1
        // WHERE a = 10
1164
1
        let predicate = Arc::new(BinaryExpr::new(
1165
1
            Arc::new(Column::new("a", 0)),
1166
1
            Operator::Eq,
1167
1
            Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
1168
1
        ));
1169
1
        let filter = FilterExec::try_new(predicate, input)
?0
;
1170
1
        assert!(filter.with_default_selectivity(120).is_err());
1171
1
        Ok(())
1172
1
    }
1173
1174
    #[tokio::test]
1175
1
    async fn test_custom_filter_selectivity() -> Result<()> {
1176
1
        // Need a decimal to trigger inexact selectivity
1177
1
        let schema =
1178
1
            Schema::new(vec![Field::new("a", DataType::Decimal128(2, 3), false)]);
1179
1
        let input = Arc::new(StatisticsExec::new(
1180
1
            Statistics {
1181
1
                num_rows: Precision::Inexact(1000),
1182
1
                total_byte_size: Precision::Inexact(4000),
1183
1
                column_statistics: vec![ColumnStatistics {
1184
1
                    ..Default::default()
1185
1
                }],
1186
1
            },
1187
1
            schema,
1188
1
        ));
1189
1
        // WHERE a = 10
1190
1
        let predicate = Arc::new(BinaryExpr::new(
1191
1
            Arc::new(Column::new("a", 0)),
1192
1
            Operator::Eq,
1193
1
            Arc::new(Literal::new(ScalarValue::Decimal128(Some(10), 10, 10))),
1194
1
        ));
1195
1
        let filter = FilterExec::try_new(predicate, input)
?0
;
1196
1
        let statistics = filter.statistics()
?0
;
1197
1
        assert_eq!(statistics.num_rows, Precision::Inexact(200));
1198
1
        assert_eq!(statistics.total_byte_size, Precision::Inexact(800));
1199
1
        let filter = filter.with_default_selectivity(40)
?0
;
1200
1
        let statistics = filter.statistics()
?0
;
1201
1
        assert_eq!(statistics.num_rows, Precision::Inexact(400));
1202
1
        assert_eq!(statistics.total_byte_size, Precision::Inexact(1600));
1203
1
        Ok(())
1204
1
    }
1205
1206
    #[test]
1207
1
    fn test_equivalence_properties_union_type() -> Result<()> {
1208
1
        let union_type = DataType::Union(
1209
1
            UnionFields::new(
1210
1
                vec![0, 1],
1211
1
                vec![
1212
1
                    Field::new("f1", DataType::Int32, true),
1213
1
                    Field::new("f2", DataType::Utf8, true),
1214
1
                ],
1215
1
            ),
1216
1
            UnionMode::Sparse,
1217
1
        );
1218
1
1219
1
        let schema = Arc::new(Schema::new(vec![
1220
1
            Field::new("c1", DataType::Int32, true),
1221
1
            Field::new("c2", union_type, true),
1222
1
        ]));
1223
1224
1
        let exec = FilterExec::try_new(
1225
            binary(
1226
1
                binary(col("c1", &schema)
?0
, Operator::GtEq, lit(1i32), &schema)
?0
,
1227
1
                Operator::And,
1228
1
                binary(col("c1", &schema)
?0
, Operator::LtEq, lit(4i32), &schema)
?0
,
1229
1
                &schema,
1230
0
            )?,
1231
1
            Arc::new(EmptyExec::new(Arc::clone(&schema))),
1232
0
        )?;
1233
1234
1
        exec.statistics().unwrap();
1235
1
1236
1
        Ok(())
1237
1
    }
1238
}