/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/filter.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use std::any::Any; |
19 | | use std::pin::Pin; |
20 | | use std::sync::Arc; |
21 | | use std::task::{ready, Context, Poll}; |
22 | | |
23 | | use super::{ |
24 | | ColumnStatistics, DisplayAs, ExecutionPlanProperties, PlanProperties, |
25 | | RecordBatchStream, SendableRecordBatchStream, Statistics, |
26 | | }; |
27 | | use crate::common::can_project; |
28 | | use crate::{ |
29 | | metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, |
30 | | DisplayFormatType, ExecutionPlan, |
31 | | }; |
32 | | |
33 | | use arrow::compute::filter_record_batch; |
34 | | use arrow::datatypes::{DataType, SchemaRef}; |
35 | | use arrow::record_batch::RecordBatch; |
36 | | use datafusion_common::cast::as_boolean_array; |
37 | | use datafusion_common::stats::Precision; |
38 | | use datafusion_common::{ |
39 | | internal_err, plan_err, project_schema, DataFusionError, Result, |
40 | | }; |
41 | | use datafusion_execution::TaskContext; |
42 | | use datafusion_expr::Operator; |
43 | | use datafusion_physical_expr::equivalence::ProjectionMapping; |
44 | | use datafusion_physical_expr::expressions::BinaryExpr; |
45 | | use datafusion_physical_expr::intervals::utils::check_support; |
46 | | use datafusion_physical_expr::utils::collect_columns; |
47 | | use datafusion_physical_expr::{ |
48 | | analyze, split_conjunction, AnalysisContext, ConstExpr, ExprBoundaries, PhysicalExpr, |
49 | | }; |
50 | | |
51 | | use futures::stream::{Stream, StreamExt}; |
52 | | use log::trace; |
53 | | |
/// FilterExec evaluates a boolean predicate against all input batches to determine which rows to
/// include in its output batches.
#[derive(Debug)]
pub struct FilterExec {
    /// The expression to filter on. This expression must evaluate to a boolean value.
    predicate: Arc<dyn PhysicalExpr>,
    /// The input plan
    input: Arc<dyn ExecutionPlan>,
    /// Execution metrics
    metrics: ExecutionPlanMetricsSet,
    /// Selectivity for statistics. 0 = no rows, 100 = all rows.
    /// Used when the predicate cannot be analyzed for an estimate.
    default_selectivity: u8,
    /// Cached plan properties: equivalence properties, partitioning, etc.
    cache: PlanProperties,
    /// Optional indices of the input columns to keep in the output batches;
    /// `None` keeps every input column.
    projection: Option<Vec<usize>>,
}
71 | | |
72 | | impl FilterExec { |
73 | | /// Create a FilterExec on an input |
74 | 16 | pub fn try_new( |
75 | 16 | predicate: Arc<dyn PhysicalExpr>, |
76 | 16 | input: Arc<dyn ExecutionPlan>, |
77 | 16 | ) -> Result<Self> { |
78 | 16 | match predicate.data_type(input.schema().as_ref())?0 { |
79 | | DataType::Boolean => { |
80 | 16 | let default_selectivity = 20; |
81 | 16 | let cache = Self::compute_properties( |
82 | 16 | &input, |
83 | 16 | &predicate, |
84 | 16 | default_selectivity, |
85 | 16 | None, |
86 | 16 | )?0 ; |
87 | 16 | Ok(Self { |
88 | 16 | predicate, |
89 | 16 | input: Arc::clone(&input), |
90 | 16 | metrics: ExecutionPlanMetricsSet::new(), |
91 | 16 | default_selectivity, |
92 | 16 | cache, |
93 | 16 | projection: None, |
94 | 16 | }) |
95 | | } |
96 | 0 | other => { |
97 | 0 | plan_err!("Filter predicate must return BOOLEAN values, got {other:?}") |
98 | | } |
99 | | } |
100 | 16 | } |
101 | | |
102 | 2 | pub fn with_default_selectivity( |
103 | 2 | mut self, |
104 | 2 | default_selectivity: u8, |
105 | 2 | ) -> Result<Self, DataFusionError> { |
106 | 2 | if default_selectivity > 100 { |
107 | 1 | return plan_err!( |
108 | 1 | "Default filter selectivity value needs to be less than or equal to 100" |
109 | 1 | ); |
110 | 1 | } |
111 | 1 | self.default_selectivity = default_selectivity; |
112 | 1 | Ok(self) |
113 | 2 | } |
114 | | |
115 | | /// Return new instance of [FilterExec] with the given projection. |
116 | 0 | pub fn with_projection(&self, projection: Option<Vec<usize>>) -> Result<Self> { |
117 | 0 | // check if the projection is valid |
118 | 0 | can_project(&self.schema(), projection.as_ref())?; |
119 | | |
120 | 0 | let projection = match projection { |
121 | 0 | Some(projection) => match &self.projection { |
122 | 0 | Some(p) => Some(projection.iter().map(|i| p[*i]).collect()), |
123 | 0 | None => Some(projection), |
124 | | }, |
125 | 0 | None => None, |
126 | | }; |
127 | | |
128 | 0 | let cache = Self::compute_properties( |
129 | 0 | &self.input, |
130 | 0 | &self.predicate, |
131 | 0 | self.default_selectivity, |
132 | 0 | projection.as_ref(), |
133 | 0 | )?; |
134 | 0 | Ok(Self { |
135 | 0 | predicate: Arc::clone(&self.predicate), |
136 | 0 | input: Arc::clone(&self.input), |
137 | 0 | metrics: self.metrics.clone(), |
138 | 0 | default_selectivity: self.default_selectivity, |
139 | 0 | cache, |
140 | 0 | projection, |
141 | 0 | }) |
142 | 0 | } |
143 | | |
144 | | /// The expression to filter on. This expression must evaluate to a boolean value. |
145 | 20 | pub fn predicate(&self) -> &Arc<dyn PhysicalExpr> { |
146 | 20 | &self.predicate |
147 | 20 | } |
148 | | |
149 | | /// The input plan |
150 | 0 | pub fn input(&self) -> &Arc<dyn ExecutionPlan> { |
151 | 0 | &self.input |
152 | 0 | } |
153 | | |
154 | | /// The default selectivity |
155 | 0 | pub fn default_selectivity(&self) -> u8 { |
156 | 0 | self.default_selectivity |
157 | 0 | } |
158 | | |
159 | | /// projection |
160 | 0 | pub fn projection(&self) -> Option<&Vec<usize>> { |
161 | 0 | self.projection.as_ref() |
162 | 0 | } |
163 | | |
164 | | /// Calculates `Statistics` for `FilterExec`, by applying selectivity (either default, or estimated) to input statistics. |
165 | 36 | fn statistics_helper( |
166 | 36 | input: &Arc<dyn ExecutionPlan>, |
167 | 36 | predicate: &Arc<dyn PhysicalExpr>, |
168 | 36 | default_selectivity: u8, |
169 | 36 | ) -> Result<Statistics> { |
170 | 36 | let input_stats = input.statistics()?0 ; |
171 | 36 | let schema = input.schema(); |
172 | 36 | if !check_support(predicate, &schema) { |
173 | 3 | let selectivity = default_selectivity as f64 / 100.0; |
174 | 3 | let mut stats = input_stats.to_inexact(); |
175 | 3 | stats.num_rows = stats.num_rows.with_estimated_selectivity(selectivity); |
176 | 3 | stats.total_byte_size = stats |
177 | 3 | .total_byte_size |
178 | 3 | .with_estimated_selectivity(selectivity); |
179 | 3 | return Ok(stats); |
180 | 33 | } |
181 | 33 | |
182 | 33 | let num_rows = input_stats.num_rows; |
183 | 33 | let total_byte_size = input_stats.total_byte_size; |
184 | 33 | let input_analysis_ctx = AnalysisContext::try_from_statistics( |
185 | 33 | &input.schema(), |
186 | 33 | &input_stats.column_statistics, |
187 | 33 | )?0 ; |
188 | | |
189 | 33 | let analysis_ctx = analyze(predicate, input_analysis_ctx, &schema)?0 ; |
190 | | |
191 | | // Estimate (inexact) selectivity of predicate |
192 | 33 | let selectivity = analysis_ctx.selectivity.unwrap_or(1.0); |
193 | 33 | let num_rows = num_rows.with_estimated_selectivity(selectivity); |
194 | 33 | let total_byte_size = total_byte_size.with_estimated_selectivity(selectivity); |
195 | 33 | |
196 | 33 | let column_statistics = collect_new_statistics( |
197 | 33 | &input_stats.column_statistics, |
198 | 33 | analysis_ctx.boundaries, |
199 | 33 | ); |
200 | 33 | Ok(Statistics { |
201 | 33 | num_rows, |
202 | 33 | total_byte_size, |
203 | 33 | column_statistics, |
204 | 33 | }) |
205 | 36 | } |
206 | | |
207 | 16 | fn extend_constants( |
208 | 16 | input: &Arc<dyn ExecutionPlan>, |
209 | 16 | predicate: &Arc<dyn PhysicalExpr>, |
210 | 16 | ) -> Vec<ConstExpr> { |
211 | 16 | let mut res_constants = Vec::new(); |
212 | 16 | let input_eqs = input.equivalence_properties(); |
213 | 16 | |
214 | 16 | let conjunctions = split_conjunction(predicate); |
215 | 39 | for conjunction23 in conjunctions { |
216 | 23 | if let Some(binary) = conjunction.as_any().downcast_ref::<BinaryExpr>() { |
217 | 23 | if binary.op() == &Operator::Eq { |
218 | | // Filter evaluates to single value for all partitions |
219 | 4 | if input_eqs.is_expr_constant(binary.left()) { |
220 | 0 | res_constants.push( |
221 | 0 | ConstExpr::from(binary.right()).with_across_partitions(true), |
222 | 0 | ) |
223 | 4 | } else if input_eqs.is_expr_constant(binary.right()) { |
224 | 4 | res_constants.push( |
225 | 4 | ConstExpr::from(binary.left()).with_across_partitions(true), |
226 | 4 | ) |
227 | 0 | } |
228 | 19 | } |
229 | 0 | } |
230 | | } |
231 | 16 | res_constants |
232 | 16 | } |
233 | | /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. |
234 | 16 | fn compute_properties( |
235 | 16 | input: &Arc<dyn ExecutionPlan>, |
236 | 16 | predicate: &Arc<dyn PhysicalExpr>, |
237 | 16 | default_selectivity: u8, |
238 | 16 | projection: Option<&Vec<usize>>, |
239 | 16 | ) -> Result<PlanProperties> { |
240 | | // Combine the equal predicates with the input equivalence properties |
241 | | // to construct the equivalence properties: |
242 | 16 | let stats = Self::statistics_helper(input, predicate, default_selectivity)?0 ; |
243 | 16 | let mut eq_properties = input.equivalence_properties().clone(); |
244 | 16 | let (equal_pairs, _) = collect_columns_from_predicate(predicate); |
245 | 20 | for (lhs, rhs4 ) in equal_pairs { |
246 | 4 | eq_properties.add_equal_conditions(lhs, rhs)?0 |
247 | | } |
248 | | // Add the columns that have only one viable value (singleton) after |
249 | | // filtering to constants. |
250 | 16 | let constants = collect_columns(predicate) |
251 | 16 | .into_iter() |
252 | 20 | .filter(|column| stats.column_statistics[column.index()].is_singleton()) |
253 | 16 | .map(|column| { |
254 | 3 | let expr = Arc::new(column) as _; |
255 | 3 | ConstExpr::new(expr).with_across_partitions(true) |
256 | 16 | }); |
257 | 16 | // this is for statistics |
258 | 16 | eq_properties = eq_properties.with_constants(constants); |
259 | 16 | // this is for logical constant (for example: a = '1', then a could be marked as a constant) |
260 | 16 | // to do: how to deal with multiple situation to represent = (for example c1 between 0 and 0) |
261 | 16 | eq_properties = |
262 | 16 | eq_properties.with_constants(Self::extend_constants(input, predicate)); |
263 | 16 | |
264 | 16 | let mut output_partitioning = input.output_partitioning().clone(); |
265 | | // If contains projection, update the PlanProperties. |
266 | 16 | if let Some(projection0 ) = projection { |
267 | 0 | let schema = eq_properties.schema(); |
268 | 0 | let projection_mapping = ProjectionMapping::from_indices(projection, schema)?; |
269 | 0 | let out_schema = project_schema(schema, Some(projection))?; |
270 | 0 | output_partitioning = |
271 | 0 | output_partitioning.project(&projection_mapping, &eq_properties); |
272 | 0 | eq_properties = eq_properties.project(&projection_mapping, out_schema); |
273 | 16 | } |
274 | 16 | Ok(PlanProperties::new( |
275 | 16 | eq_properties, |
276 | 16 | output_partitioning, |
277 | 16 | input.execution_mode(), |
278 | 16 | )) |
279 | 16 | } |
280 | | } |
281 | | |
282 | | impl DisplayAs for FilterExec { |
283 | 0 | fn fmt_as( |
284 | 0 | &self, |
285 | 0 | t: DisplayFormatType, |
286 | 0 | f: &mut std::fmt::Formatter, |
287 | 0 | ) -> std::fmt::Result { |
288 | 0 | match t { |
289 | | DisplayFormatType::Default | DisplayFormatType::Verbose => { |
290 | 0 | let display_projections = if let Some(projection) = |
291 | 0 | self.projection.as_ref() |
292 | | { |
293 | 0 | format!( |
294 | 0 | ", projection=[{}]", |
295 | 0 | projection |
296 | 0 | .iter() |
297 | 0 | .map(|index| format!( |
298 | 0 | "{}@{}", |
299 | 0 | self.input.schema().fields().get(*index).unwrap().name(), |
300 | 0 | index |
301 | 0 | )) |
302 | 0 | .collect::<Vec<_>>() |
303 | 0 | .join(", ") |
304 | 0 | ) |
305 | | } else { |
306 | 0 | "".to_string() |
307 | | }; |
308 | 0 | write!(f, "FilterExec: {}{}", self.predicate, display_projections) |
309 | 0 | } |
310 | 0 | } |
311 | 0 | } |
312 | | } |
313 | | |
314 | | impl ExecutionPlan for FilterExec { |
315 | 0 | fn name(&self) -> &'static str { |
316 | 0 | "FilterExec" |
317 | 0 | } |
318 | | |
319 | | /// Return a reference to Any that can be used for downcasting |
320 | 0 | fn as_any(&self) -> &dyn Any { |
321 | 0 | self |
322 | 0 | } |
323 | | |
324 | 29 | fn properties(&self) -> &PlanProperties { |
325 | 29 | &self.cache |
326 | 29 | } |
327 | | |
328 | 0 | fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> { |
329 | 0 | vec![&self.input] |
330 | 0 | } |
331 | | |
332 | 0 | fn maintains_input_order(&self) -> Vec<bool> { |
333 | 0 | // tell optimizer this operator doesn't reorder its input |
334 | 0 | vec![true] |
335 | 0 | } |
336 | | |
337 | 0 | fn with_new_children( |
338 | 0 | self: Arc<Self>, |
339 | 0 | mut children: Vec<Arc<dyn ExecutionPlan>>, |
340 | 0 | ) -> Result<Arc<dyn ExecutionPlan>> { |
341 | 0 | FilterExec::try_new(Arc::clone(&self.predicate), children.swap_remove(0)) |
342 | 0 | .and_then(|e| { |
343 | 0 | let selectivity = e.default_selectivity(); |
344 | 0 | e.with_default_selectivity(selectivity) |
345 | 0 | }) |
346 | 0 | .and_then(|e| e.with_projection(self.projection().cloned())) |
347 | 0 | .map(|e| Arc::new(e) as _) |
348 | 0 | } |
349 | | |
350 | 0 | fn execute( |
351 | 0 | &self, |
352 | 0 | partition: usize, |
353 | 0 | context: Arc<TaskContext>, |
354 | 0 | ) -> Result<SendableRecordBatchStream> { |
355 | 0 | trace!("Start FilterExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id()); |
356 | 0 | let baseline_metrics = BaselineMetrics::new(&self.metrics, partition); |
357 | 0 | Ok(Box::pin(FilterExecStream { |
358 | 0 | schema: self.schema(), |
359 | 0 | predicate: Arc::clone(&self.predicate), |
360 | 0 | input: self.input.execute(partition, context)?, |
361 | 0 | baseline_metrics, |
362 | 0 | projection: self.projection.clone(), |
363 | | })) |
364 | 0 | } |
365 | | |
366 | 0 | fn metrics(&self) -> Option<MetricsSet> { |
367 | 0 | Some(self.metrics.clone_inner()) |
368 | 0 | } |
369 | | |
370 | | /// The output statistics of a filtering operation can be estimated if the |
371 | | /// predicate's selectivity value can be determined for the incoming data. |
372 | 20 | fn statistics(&self) -> Result<Statistics> { |
373 | 20 | Self::statistics_helper(&self.input, self.predicate(), self.default_selectivity) |
374 | 20 | } |
375 | | } |
376 | | |
377 | | /// This function ensures that all bounds in the `ExprBoundaries` vector are |
378 | | /// converted to closed bounds. If a lower/upper bound is initially open, it |
379 | | /// is adjusted by using the next/previous value for its data type to convert |
380 | | /// it into a closed bound. |
381 | 33 | fn collect_new_statistics( |
382 | 33 | input_column_stats: &[ColumnStatistics], |
383 | 33 | analysis_boundaries: Vec<ExprBoundaries>, |
384 | 33 | ) -> Vec<ColumnStatistics> { |
385 | 33 | analysis_boundaries |
386 | 33 | .into_iter() |
387 | 33 | .enumerate() |
388 | 33 | .map( |
389 | 33 | |( |
390 | | idx, |
391 | | ExprBoundaries { |
392 | | interval, |
393 | | distinct_count, |
394 | | .. |
395 | | }, |
396 | 54 | )| { |
397 | 54 | let (lower, upper) = interval.into_bounds(); |
398 | 54 | let (min_value, max_value) = if lower.eq(&upper) { |
399 | 7 | (Precision::Exact(lower), Precision::Exact(upper)) |
400 | | } else { |
401 | 47 | (Precision::Inexact(lower), Precision::Inexact(upper)) |
402 | | }; |
403 | 54 | ColumnStatistics { |
404 | 54 | null_count: input_column_stats[idx].null_count.to_inexact(), |
405 | 54 | max_value, |
406 | 54 | min_value, |
407 | 54 | distinct_count: distinct_count.to_inexact(), |
408 | 54 | } |
409 | 54 | }, |
410 | 33 | ) |
411 | 33 | .collect() |
412 | 33 | } |
413 | | |
/// The FilterExec stream wraps the input stream and applies the predicate expression to
/// determine which rows to include in its output batches
struct FilterExecStream {
    /// Output schema after the projection
    schema: SchemaRef,
    /// The expression to filter on. This expression must evaluate to a boolean value.
    predicate: Arc<dyn PhysicalExpr>,
    /// The input partition to filter.
    input: SendableRecordBatchStream,
    /// Runtime metrics recording
    baseline_metrics: BaselineMetrics,
    /// The projection indices of the columns in the input schema;
    /// `None` keeps every input column.
    projection: Option<Vec<usize>>,
}
428 | | |
429 | 0 | pub fn batch_filter( |
430 | 0 | batch: &RecordBatch, |
431 | 0 | predicate: &Arc<dyn PhysicalExpr>, |
432 | 0 | ) -> Result<RecordBatch> { |
433 | 0 | filter_and_project(batch, predicate, None, &batch.schema()) |
434 | 0 | } |
435 | | |
436 | 0 | fn filter_and_project( |
437 | 0 | batch: &RecordBatch, |
438 | 0 | predicate: &Arc<dyn PhysicalExpr>, |
439 | 0 | projection: Option<&Vec<usize>>, |
440 | 0 | output_schema: &SchemaRef, |
441 | 0 | ) -> Result<RecordBatch> { |
442 | 0 | predicate |
443 | 0 | .evaluate(batch) |
444 | 0 | .and_then(|v| v.into_array(batch.num_rows())) |
445 | 0 | .and_then(|array| { |
446 | 0 | Ok(match (as_boolean_array(&array), projection) { |
447 | | // apply filter array to record batch |
448 | 0 | (Ok(filter_array), None) => filter_record_batch(batch, filter_array)?, |
449 | 0 | (Ok(filter_array), Some(projection)) => { |
450 | 0 | let projected_columns = projection |
451 | 0 | .iter() |
452 | 0 | .map(|i| Arc::clone(batch.column(*i))) |
453 | 0 | .collect(); |
454 | 0 | let projected_batch = RecordBatch::try_new( |
455 | 0 | Arc::clone(output_schema), |
456 | 0 | projected_columns, |
457 | 0 | )?; |
458 | 0 | filter_record_batch(&projected_batch, filter_array)? |
459 | | } |
460 | | (Err(_), _) => { |
461 | 0 | return internal_err!( |
462 | 0 | "Cannot create filter_array from non-boolean predicates" |
463 | 0 | ); |
464 | | } |
465 | | }) |
466 | 0 | }) |
467 | 0 | } |
468 | | |
469 | | impl Stream for FilterExecStream { |
470 | | type Item = Result<RecordBatch>; |
471 | | |
472 | 0 | fn poll_next( |
473 | 0 | mut self: Pin<&mut Self>, |
474 | 0 | cx: &mut Context<'_>, |
475 | 0 | ) -> Poll<Option<Self::Item>> { |
476 | | let poll; |
477 | | loop { |
478 | 0 | match ready!(self.input.poll_next_unpin(cx)) { |
479 | 0 | Some(Ok(batch)) => { |
480 | 0 | let timer = self.baseline_metrics.elapsed_compute().timer(); |
481 | 0 | let filtered_batch = filter_and_project( |
482 | 0 | &batch, |
483 | 0 | &self.predicate, |
484 | 0 | self.projection.as_ref(), |
485 | 0 | &self.schema, |
486 | 0 | )?; |
487 | 0 | timer.done(); |
488 | 0 | // skip entirely filtered batches |
489 | 0 | if filtered_batch.num_rows() == 0 { |
490 | 0 | continue; |
491 | 0 | } |
492 | 0 | poll = Poll::Ready(Some(Ok(filtered_batch))); |
493 | 0 | break; |
494 | | } |
495 | 0 | value => { |
496 | 0 | poll = Poll::Ready(value); |
497 | 0 | break; |
498 | | } |
499 | | } |
500 | | } |
501 | 0 | self.baseline_metrics.record_poll(poll) |
502 | 0 | } |
503 | | |
504 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
505 | 0 | // same number of record batches |
506 | 0 | self.input.size_hint() |
507 | 0 | } |
508 | | } |
509 | | |
impl RecordBatchStream for FilterExecStream {
    /// Returns a shared handle to the stream's output schema.
    fn schema(&self) -> SchemaRef {
        Arc::clone(&self.schema)
    }
}
515 | | |
516 | | /// Return the equals Column-Pairs and Non-equals Column-Pairs |
517 | 17 | fn collect_columns_from_predicate(predicate: &Arc<dyn PhysicalExpr>) -> EqualAndNonEqual { |
518 | 17 | let mut eq_predicate_columns = Vec::<PhysicalExprPairRef>::new(); |
519 | 17 | let mut ne_predicate_columns = Vec::<PhysicalExprPairRef>::new(); |
520 | 17 | |
521 | 17 | let predicates = split_conjunction(predicate); |
522 | 27 | predicates.into_iter().for_each(|p| { |
523 | 27 | if let Some(binary) = p.as_any().downcast_ref::<BinaryExpr>() { |
524 | 27 | match binary.op() { |
525 | | Operator::Eq => { |
526 | 6 | eq_predicate_columns.push((binary.left(), binary.right())) |
527 | | } |
528 | | Operator::NotEq => { |
529 | 1 | ne_predicate_columns.push((binary.left(), binary.right())) |
530 | | } |
531 | 20 | _ => {} |
532 | | } |
533 | 0 | } |
534 | 27 | }); |
535 | 17 | |
536 | 17 | (eq_predicate_columns, ne_predicate_columns) |
537 | 17 | } |
538 | | |
/// Pair of borrowed `Arc<dyn PhysicalExpr>`s: the left and right operands of a
/// binary comparison
pub type PhysicalExprPairRef<'a> = (&'a Arc<dyn PhysicalExpr>, &'a Arc<dyn PhysicalExpr>);
541 | | |
/// The operand pairs of the equality (`=`) conjuncts, followed by the operand
/// pairs of the inequality (`!=`) conjuncts, found in a predicate
pub type EqualAndNonEqual<'a> =
    (Vec<PhysicalExprPairRef<'a>>, Vec<PhysicalExprPairRef<'a>>);
545 | | |
546 | | #[cfg(test)] |
547 | | mod tests { |
548 | | use super::*; |
549 | | use crate::empty::EmptyExec; |
550 | | use crate::expressions::*; |
551 | | use crate::test; |
552 | | use crate::test::exec::StatisticsExec; |
553 | | |
554 | | use arrow::datatypes::{Field, Schema}; |
555 | | use arrow_schema::{UnionFields, UnionMode}; |
556 | | use datafusion_common::ScalarValue; |
557 | | |
558 | | #[tokio::test] |
559 | 1 | async fn collect_columns_predicates() -> Result<()> { |
560 | 1 | let schema = test::aggr_test_schema(); |
561 | 1 | let predicate: Arc<dyn PhysicalExpr> = binary( |
562 | 1 | binary( |
563 | 1 | binary(col("c2", &schema)?0 , Operator::GtEq, lit(1u32), &schema)?0 , |
564 | 1 | Operator::And, |
565 | 1 | binary(col("c2", &schema)?0 , Operator::Eq, lit(4u32), &schema)?0 , |
566 | 1 | &schema, |
567 | 1 | )?0 , |
568 | 1 | Operator::And, |
569 | 1 | binary( |
570 | 1 | binary( |
571 | 1 | col("c2", &schema)?0 , |
572 | 1 | Operator::Eq, |
573 | 1 | col("c9", &schema)?0 , |
574 | 1 | &schema, |
575 | 1 | )?0 , |
576 | 1 | Operator::And, |
577 | 1 | binary( |
578 | 1 | col("c1", &schema)?0 , |
579 | 1 | Operator::NotEq, |
580 | 1 | col("c13", &schema)?0 , |
581 | 1 | &schema, |
582 | 1 | )?0 , |
583 | 1 | &schema, |
584 | 1 | )?0 , |
585 | 1 | &schema, |
586 | 1 | )?0 ; |
587 | 1 | |
588 | 1 | let (equal_pairs, ne_pairs) = collect_columns_from_predicate(&predicate); |
589 | 1 | assert_eq!(2, equal_pairs.len()); |
590 | 1 | assert!(equal_pairs[0].0.eq(&col("c2", &schema)?0 )); |
591 | 1 | assert!(equal_pairs[0].1.eq(&lit(4u32))); |
592 | 1 | |
593 | 1 | assert!(equal_pairs[1].0.eq(&col("c2", &schema)?0 )); |
594 | 1 | assert!(equal_pairs[1].1.eq(&col("c9", &schema)?0 )); |
595 | 1 | |
596 | 1 | assert_eq!(1, ne_pairs.len()); |
597 | 1 | assert!(ne_pairs[0].0.eq(&col("c1", &schema)?0 )); |
598 | 1 | assert!(ne_pairs[0].1.eq(&col("c13", &schema)?0 )); |
599 | 1 | |
600 | 1 | Ok(()) |
601 | 1 | } |
602 | | |
603 | | #[tokio::test] |
604 | 1 | async fn test_filter_statistics_basic_expr() -> Result<()> { |
605 | 1 | // Table: |
606 | 1 | // a: min=1, max=100 |
607 | 1 | let bytes_per_row = 4; |
608 | 1 | let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); |
609 | 1 | let input = Arc::new(StatisticsExec::new( |
610 | 1 | Statistics { |
611 | 1 | num_rows: Precision::Inexact(100), |
612 | 1 | total_byte_size: Precision::Inexact(100 * bytes_per_row), |
613 | 1 | column_statistics: vec![ColumnStatistics { |
614 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
615 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(100))), |
616 | 1 | ..Default::default() |
617 | 1 | }], |
618 | 1 | }, |
619 | 1 | schema.clone(), |
620 | 1 | )); |
621 | 1 | |
622 | 1 | // a <= 25 |
623 | 1 | let predicate: Arc<dyn PhysicalExpr> = |
624 | 1 | binary(col("a", &schema)?0 , Operator::LtEq, lit(25i32), &schema)?0 ; |
625 | 1 | |
626 | 1 | // WHERE a <= 25 |
627 | 1 | let filter: Arc<dyn ExecutionPlan> = |
628 | 1 | Arc::new(FilterExec::try_new(predicate, input)?0 ); |
629 | 1 | |
630 | 1 | let statistics = filter.statistics()?0 ; |
631 | 1 | assert_eq!(statistics.num_rows, Precision::Inexact(25)); |
632 | 1 | assert_eq!( |
633 | 1 | statistics.total_byte_size, |
634 | 1 | Precision::Inexact(25 * bytes_per_row) |
635 | 1 | ); |
636 | 1 | assert_eq!( |
637 | 1 | statistics.column_statistics, |
638 | 1 | vec![ColumnStatistics { |
639 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
640 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(25))), |
641 | 1 | ..Default::default() |
642 | 1 | }] |
643 | 1 | ); |
644 | 1 | |
645 | 1 | Ok(()) |
646 | 1 | } |
647 | | |
648 | | #[tokio::test] |
649 | 1 | async fn test_filter_statistics_column_level_nested() -> Result<()> { |
650 | 1 | // Table: |
651 | 1 | // a: min=1, max=100 |
652 | 1 | let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); |
653 | 1 | let input = Arc::new(StatisticsExec::new( |
654 | 1 | Statistics { |
655 | 1 | num_rows: Precision::Inexact(100), |
656 | 1 | column_statistics: vec![ColumnStatistics { |
657 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
658 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(100))), |
659 | 1 | ..Default::default() |
660 | 1 | }], |
661 | 1 | total_byte_size: Precision::Absent, |
662 | 1 | }, |
663 | 1 | schema.clone(), |
664 | 1 | )); |
665 | 1 | |
666 | 1 | // WHERE a <= 25 |
667 | 1 | let sub_filter: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new( |
668 | 1 | binary(col("a", &schema)?0 , Operator::LtEq, lit(25i32), &schema)?0 , |
669 | 1 | input, |
670 | 1 | )?0 ); |
671 | 1 | |
672 | 1 | // Nested filters (two separate physical plans, instead of AND chain in the expr) |
673 | 1 | // WHERE a >= 10 |
674 | 1 | // WHERE a <= 25 |
675 | 1 | let filter: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new( |
676 | 1 | binary(col("a", &schema)?0 , Operator::GtEq, lit(10i32), &schema)?0 , |
677 | 1 | sub_filter, |
678 | 1 | )?0 ); |
679 | 1 | |
680 | 1 | let statistics = filter.statistics()?0 ; |
681 | 1 | assert_eq!(statistics.num_rows, Precision::Inexact(16)); |
682 | 1 | assert_eq!( |
683 | 1 | statistics.column_statistics, |
684 | 1 | vec![ColumnStatistics { |
685 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(10))), |
686 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(25))), |
687 | 1 | ..Default::default() |
688 | 1 | }] |
689 | 1 | ); |
690 | 1 | |
691 | 1 | Ok(()) |
692 | 1 | } |
693 | | |
694 | | #[tokio::test] |
695 | 1 | async fn test_filter_statistics_column_level_nested_multiple() -> Result<()> { |
696 | 1 | // Table: |
697 | 1 | // a: min=1, max=100 |
698 | 1 | // b: min=1, max=50 |
699 | 1 | let schema = Schema::new(vec![ |
700 | 1 | Field::new("a", DataType::Int32, false), |
701 | 1 | Field::new("b", DataType::Int32, false), |
702 | 1 | ]); |
703 | 1 | let input = Arc::new(StatisticsExec::new( |
704 | 1 | Statistics { |
705 | 1 | num_rows: Precision::Inexact(100), |
706 | 1 | column_statistics: vec![ |
707 | 1 | ColumnStatistics { |
708 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
709 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(100))), |
710 | 1 | ..Default::default() |
711 | 1 | }, |
712 | 1 | ColumnStatistics { |
713 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
714 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(50))), |
715 | 1 | ..Default::default() |
716 | 1 | }, |
717 | 1 | ], |
718 | 1 | total_byte_size: Precision::Absent, |
719 | 1 | }, |
720 | 1 | schema.clone(), |
721 | 1 | )); |
722 | 1 | |
723 | 1 | // WHERE a <= 25 |
724 | 1 | let a_lte_25: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new( |
725 | 1 | binary(col("a", &schema)?0 , Operator::LtEq, lit(25i32), &schema)?0 , |
726 | 1 | input, |
727 | 1 | )?0 ); |
728 | 1 | |
729 | 1 | // WHERE b > 45 |
730 | 1 | let b_gt_5: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new( |
731 | 1 | binary(col("b", &schema)?0 , Operator::Gt, lit(45i32), &schema)?0 , |
732 | 1 | a_lte_25, |
733 | 1 | )?0 ); |
734 | 1 | |
735 | 1 | // WHERE a >= 10 |
736 | 1 | let filter: Arc<dyn ExecutionPlan> = Arc::new(FilterExec::try_new( |
737 | 1 | binary(col("a", &schema)?0 , Operator::GtEq, lit(10i32), &schema)?0 , |
738 | 1 | b_gt_5, |
739 | 1 | )?0 ); |
740 | 1 | let statistics = filter.statistics()?0 ; |
741 | 1 | // On a uniform distribution, only fifteen rows will satisfy the |
742 | 1 | // filter that 'a' proposed (a >= 10 AND a <= 25) (15/100) and only |
743 | 1 | // 5 rows will satisfy the filter that 'b' proposed (b > 45) (5/50). |
744 | 1 | // |
745 | 1 | // Which would result with a selectivity of '15/100 * 5/50' or 0.015 |
746 | 1 | // and that means about %1.5 of the all rows (rounded up to 2 rows). |
747 | 1 | assert_eq!(statistics.num_rows, Precision::Inexact(2)); |
748 | 1 | assert_eq!( |
749 | 1 | statistics.column_statistics, |
750 | 1 | vec![ |
751 | 1 | ColumnStatistics { |
752 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(10))), |
753 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(25))), |
754 | 1 | ..Default::default() |
755 | 1 | }, |
756 | 1 | ColumnStatistics { |
757 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(46))), |
758 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(50))), |
759 | 1 | ..Default::default() |
760 | 1 | } |
761 | 1 | ] |
762 | 1 | ); |
763 | 1 | |
764 | 1 | Ok(()) |
765 | 1 | } |
766 | | |
767 | | #[tokio::test] |
768 | 1 | async fn test_filter_statistics_when_input_stats_missing() -> Result<()> { |
769 | 1 | // Table: |
770 | 1 | // a: min=???, max=??? (missing) |
771 | 1 | let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); |
772 | 1 | let input = Arc::new(StatisticsExec::new( |
773 | 1 | Statistics::new_unknown(&schema), |
774 | 1 | schema.clone(), |
775 | 1 | )); |
776 | 1 | |
777 | 1 | // a <= 25 |
778 | 1 | let predicate: Arc<dyn PhysicalExpr> = |
779 | 1 | binary(col("a", &schema)?0 , Operator::LtEq, lit(25i32), &schema)?0 ; |
780 | 1 | |
781 | 1 | // WHERE a <= 25 |
782 | 1 | let filter: Arc<dyn ExecutionPlan> = |
783 | 1 | Arc::new(FilterExec::try_new(predicate, input)?0 ); |
784 | 1 | |
785 | 1 | let statistics = filter.statistics()?0 ; |
786 | 1 | assert_eq!(statistics.num_rows, Precision::Absent); |
787 | 1 | |
788 | 1 | Ok(()) |
789 | 1 | } |
790 | | |
791 | | #[tokio::test] |
792 | 1 | async fn test_filter_statistics_multiple_columns() -> Result<()> { |
793 | 1 | // Table: |
794 | 1 | // a: min=1, max=100 |
795 | 1 | // b: min=1, max=3 |
796 | 1 | // c: min=1000.0 max=1100.0 |
797 | 1 | let schema = Schema::new(vec![ |
798 | 1 | Field::new("a", DataType::Int32, false), |
799 | 1 | Field::new("b", DataType::Int32, false), |
800 | 1 | Field::new("c", DataType::Float32, false), |
801 | 1 | ]); |
802 | 1 | let input = Arc::new(StatisticsExec::new( |
803 | 1 | Statistics { |
804 | 1 | num_rows: Precision::Inexact(1000), |
805 | 1 | total_byte_size: Precision::Inexact(4000), |
806 | 1 | column_statistics: vec![ |
807 | 1 | ColumnStatistics { |
808 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
809 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(100))), |
810 | 1 | ..Default::default() |
811 | 1 | }, |
812 | 1 | ColumnStatistics { |
813 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
814 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(3))), |
815 | 1 | ..Default::default() |
816 | 1 | }, |
817 | 1 | ColumnStatistics { |
818 | 1 | min_value: Precision::Inexact(ScalarValue::Float32(Some(1000.0))), |
819 | 1 | max_value: Precision::Inexact(ScalarValue::Float32(Some(1100.0))), |
820 | 1 | ..Default::default() |
821 | 1 | }, |
822 | 1 | ], |
823 | 1 | }, |
824 | 1 | schema, |
825 | 1 | )); |
826 | 1 | // WHERE a<=53 AND (b=3 AND (c<=1075.0 AND a>b)) |
827 | 1 | let predicate = Arc::new(BinaryExpr::new( |
828 | 1 | Arc::new(BinaryExpr::new( |
829 | 1 | Arc::new(Column::new("a", 0)), |
830 | 1 | Operator::LtEq, |
831 | 1 | Arc::new(Literal::new(ScalarValue::Int32(Some(53)))), |
832 | 1 | )), |
833 | 1 | Operator::And, |
834 | 1 | Arc::new(BinaryExpr::new( |
835 | 1 | Arc::new(BinaryExpr::new( |
836 | 1 | Arc::new(Column::new("b", 1)), |
837 | 1 | Operator::Eq, |
838 | 1 | Arc::new(Literal::new(ScalarValue::Int32(Some(3)))), |
839 | 1 | )), |
840 | 1 | Operator::And, |
841 | 1 | Arc::new(BinaryExpr::new( |
842 | 1 | Arc::new(BinaryExpr::new( |
843 | 1 | Arc::new(Column::new("c", 2)), |
844 | 1 | Operator::LtEq, |
845 | 1 | Arc::new(Literal::new(ScalarValue::Float32(Some(1075.0)))), |
846 | 1 | )), |
847 | 1 | Operator::And, |
848 | 1 | Arc::new(BinaryExpr::new( |
849 | 1 | Arc::new(Column::new("a", 0)), |
850 | 1 | Operator::Gt, |
851 | 1 | Arc::new(Column::new("b", 1)), |
852 | 1 | )), |
853 | 1 | )), |
854 | 1 | )), |
855 | 1 | )); |
856 | 1 | let filter: Arc<dyn ExecutionPlan> = |
857 | 1 | Arc::new(FilterExec::try_new(predicate, input)?0 ); |
858 | 1 | let statistics = filter.statistics()?0 ; |
859 | 1 | // 0.5 (from a) * 0.333333... (from b) * 0.798387... (from c) ≈ 0.1330... |
860 | 1 | // num_rows after ceil => 133.0... => 134 |
861 | 1 | // total_byte_size after ceil => 532.0... => 533 |
862 | 1 | assert_eq!(statistics.num_rows, Precision::Inexact(134)); |
863 | 1 | assert_eq!(statistics.total_byte_size, Precision::Inexact(533)); |
864 | 1 | let exp_col_stats = vec![ |
865 | 1 | ColumnStatistics { |
866 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(4))), |
867 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(53))), |
868 | 1 | ..Default::default() |
869 | 1 | }, |
870 | 1 | ColumnStatistics { |
871 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(3))), |
872 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(3))), |
873 | 1 | ..Default::default() |
874 | 1 | }, |
875 | 1 | ColumnStatistics { |
876 | 1 | min_value: Precision::Inexact(ScalarValue::Float32(Some(1000.0))), |
877 | 1 | max_value: Precision::Inexact(ScalarValue::Float32(Some(1075.0))), |
878 | 1 | ..Default::default() |
879 | 1 | }, |
880 | 1 | ]; |
881 | 1 | let _ = exp_col_stats |
882 | 1 | .into_iter() |
883 | 1 | .zip(statistics.column_statistics) |
884 | 1 | .map(|(expected, actual)| {0 |
885 | 1 | if let Some(val0 ) = actual.min_value.get_value()0 { |
886 | 1 | if val.data_type().is_floating()0 { |
887 | 1 | // Windows rounds arithmetic operation results differently for floating point numbers. |
888 | 1 | // Therefore, we check if the actual values are in an epsilon range. |
889 | 1 | let actual_min = actual.min_value.get_value().unwrap(); |
890 | 0 | let actual_max = actual.max_value.get_value().unwrap(); |
891 | 0 | let expected_min = expected.min_value.get_value().unwrap(); |
892 | 0 | let expected_max = expected.max_value.get_value().unwrap(); |
893 | 0 | let eps = ScalarValue::Float32(Some(1e-6)); |
894 | 0 |
|
895 | 0 | assert!(actual_min.sub(expected_min).unwrap() < eps); |
896 | 1 | assert!(actual_min.sub(expected_min).unwrap() < eps)0 ; |
897 | 1 | |
898 | 1 | assert!(actual_max.sub(expected_max).unwrap() < eps)0 ; |
899 | 1 | assert!(actual_max.sub(expected_max).unwrap() < eps)0 ; |
900 | 1 | } else { |
901 | 1 | assert_eq!(actual, expected)0 ; |
902 | 1 | } |
903 | 1 | } else { |
904 | 1 | assert_eq!(actual, expected)0 ; |
905 | 1 | } |
906 | 1 | }0 ); |
907 | 1 | |
908 | 1 | Ok(()) |
909 | 1 | } |
910 | | |
911 | | #[tokio::test] |
912 | 1 | async fn test_filter_statistics_full_selective() -> Result<()> { |
913 | 1 | // Table: |
914 | 1 | // a: min=1, max=100 |
915 | 1 | // b: min=1, max=3 |
916 | 1 | let schema = Schema::new(vec![ |
917 | 1 | Field::new("a", DataType::Int32, false), |
918 | 1 | Field::new("b", DataType::Int32, false), |
919 | 1 | ]); |
920 | 1 | let input = Arc::new(StatisticsExec::new( |
921 | 1 | Statistics { |
922 | 1 | num_rows: Precision::Inexact(1000), |
923 | 1 | total_byte_size: Precision::Inexact(4000), |
924 | 1 | column_statistics: vec![ |
925 | 1 | ColumnStatistics { |
926 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
927 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(100))), |
928 | 1 | ..Default::default() |
929 | 1 | }, |
930 | 1 | ColumnStatistics { |
931 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
932 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(3))), |
933 | 1 | ..Default::default() |
934 | 1 | }, |
935 | 1 | ], |
936 | 1 | }, |
937 | 1 | schema, |
938 | 1 | )); |
939 | 1 | // WHERE a<200 AND 1<=b |
940 | 1 | let predicate = Arc::new(BinaryExpr::new( |
941 | 1 | Arc::new(BinaryExpr::new( |
942 | 1 | Arc::new(Column::new("a", 0)), |
943 | 1 | Operator::Lt, |
944 | 1 | Arc::new(Literal::new(ScalarValue::Int32(Some(200)))), |
945 | 1 | )), |
946 | 1 | Operator::And, |
947 | 1 | Arc::new(BinaryExpr::new( |
948 | 1 | Arc::new(Literal::new(ScalarValue::Int32(Some(1)))), |
949 | 1 | Operator::LtEq, |
950 | 1 | Arc::new(Column::new("b", 1)), |
951 | 1 | )), |
952 | 1 | )); |
953 | 1 | // Since filter predicate passes all entries, statistics after filter shouldn't change. |
954 | 1 | let expected = input.statistics()?0 .column_statistics; |
955 | 1 | let filter: Arc<dyn ExecutionPlan> = |
956 | 1 | Arc::new(FilterExec::try_new(predicate, input)?0 ); |
957 | 1 | let statistics = filter.statistics()?0 ; |
958 | 1 | |
959 | 1 | assert_eq!(statistics.num_rows, Precision::Inexact(1000)); |
960 | 1 | assert_eq!(statistics.total_byte_size, Precision::Inexact(4000)); |
961 | 1 | assert_eq!(statistics.column_statistics, expected); |
962 | 1 | |
963 | 1 | Ok(()) |
964 | 1 | } |
965 | | |
966 | | #[tokio::test] |
967 | 1 | async fn test_filter_statistics_zero_selective() -> Result<()> { |
968 | 1 | // Table: |
969 | 1 | // a: min=1, max=100 |
970 | 1 | // b: min=1, max=3 |
971 | 1 | let schema = Schema::new(vec![ |
972 | 1 | Field::new("a", DataType::Int32, false), |
973 | 1 | Field::new("b", DataType::Int32, false), |
974 | 1 | ]); |
975 | 1 | let input = Arc::new(StatisticsExec::new( |
976 | 1 | Statistics { |
977 | 1 | num_rows: Precision::Inexact(1000), |
978 | 1 | total_byte_size: Precision::Inexact(4000), |
979 | 1 | column_statistics: vec![ |
980 | 1 | ColumnStatistics { |
981 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
982 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(100))), |
983 | 1 | ..Default::default() |
984 | 1 | }, |
985 | 1 | ColumnStatistics { |
986 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
987 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(3))), |
988 | 1 | ..Default::default() |
989 | 1 | }, |
990 | 1 | ], |
991 | 1 | }, |
992 | 1 | schema, |
993 | 1 | )); |
994 | 1 | // WHERE a>200 AND 1<=b |
995 | 1 | let predicate = Arc::new(BinaryExpr::new( |
996 | 1 | Arc::new(BinaryExpr::new( |
997 | 1 | Arc::new(Column::new("a", 0)), |
998 | 1 | Operator::Gt, |
999 | 1 | Arc::new(Literal::new(ScalarValue::Int32(Some(200)))), |
1000 | 1 | )), |
1001 | 1 | Operator::And, |
1002 | 1 | Arc::new(BinaryExpr::new( |
1003 | 1 | Arc::new(Literal::new(ScalarValue::Int32(Some(1)))), |
1004 | 1 | Operator::LtEq, |
1005 | 1 | Arc::new(Column::new("b", 1)), |
1006 | 1 | )), |
1007 | 1 | )); |
1008 | 1 | let filter: Arc<dyn ExecutionPlan> = |
1009 | 1 | Arc::new(FilterExec::try_new(predicate, input)?0 ); |
1010 | 1 | let statistics = filter.statistics()?0 ; |
1011 | 1 | |
1012 | 1 | assert_eq!(statistics.num_rows, Precision::Inexact(0)); |
1013 | 1 | assert_eq!(statistics.total_byte_size, Precision::Inexact(0)); |
1014 | 1 | assert_eq!( |
1015 | 1 | statistics.column_statistics, |
1016 | 1 | vec![ |
1017 | 1 | ColumnStatistics { |
1018 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
1019 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(100))), |
1020 | 1 | ..Default::default() |
1021 | 1 | }, |
1022 | 1 | ColumnStatistics { |
1023 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
1024 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(3))), |
1025 | 1 | ..Default::default() |
1026 | 1 | }, |
1027 | 1 | ] |
1028 | 1 | ); |
1029 | 1 | |
1030 | 1 | Ok(()) |
1031 | 1 | } |
1032 | | |
1033 | | #[tokio::test] |
1034 | 1 | async fn test_filter_statistics_more_inputs() -> Result<()> { |
1035 | 1 | let schema = Schema::new(vec![ |
1036 | 1 | Field::new("a", DataType::Int32, false), |
1037 | 1 | Field::new("b", DataType::Int32, false), |
1038 | 1 | ]); |
1039 | 1 | let input = Arc::new(StatisticsExec::new( |
1040 | 1 | Statistics { |
1041 | 1 | num_rows: Precision::Inexact(1000), |
1042 | 1 | total_byte_size: Precision::Inexact(4000), |
1043 | 1 | column_statistics: vec![ |
1044 | 1 | ColumnStatistics { |
1045 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
1046 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(100))), |
1047 | 1 | ..Default::default() |
1048 | 1 | }, |
1049 | 1 | ColumnStatistics { |
1050 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
1051 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(100))), |
1052 | 1 | ..Default::default() |
1053 | 1 | }, |
1054 | 1 | ], |
1055 | 1 | }, |
1056 | 1 | schema, |
1057 | 1 | )); |
1058 | 1 | // WHERE a<50 |
1059 | 1 | let predicate = Arc::new(BinaryExpr::new( |
1060 | 1 | Arc::new(Column::new("a", 0)), |
1061 | 1 | Operator::Lt, |
1062 | 1 | Arc::new(Literal::new(ScalarValue::Int32(Some(50)))), |
1063 | 1 | )); |
1064 | 1 | let filter: Arc<dyn ExecutionPlan> = |
1065 | 1 | Arc::new(FilterExec::try_new(predicate, input)?0 ); |
1066 | 1 | let statistics = filter.statistics()?0 ; |
1067 | 1 | |
1068 | 1 | assert_eq!(statistics.num_rows, Precision::Inexact(490)); |
1069 | 1 | assert_eq!(statistics.total_byte_size, Precision::Inexact(1960)); |
1070 | 1 | assert_eq!( |
1071 | 1 | statistics.column_statistics, |
1072 | 1 | vec![ |
1073 | 1 | ColumnStatistics { |
1074 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
1075 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(49))), |
1076 | 1 | ..Default::default() |
1077 | 1 | }, |
1078 | 1 | ColumnStatistics { |
1079 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(1))), |
1080 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(100))), |
1081 | 1 | ..Default::default() |
1082 | 1 | }, |
1083 | 1 | ] |
1084 | 1 | ); |
1085 | 1 | |
1086 | 1 | Ok(()) |
1087 | 1 | } |
1088 | | |
1089 | | #[tokio::test] |
1090 | 1 | async fn test_empty_input_statistics() -> Result<()> { |
1091 | 1 | let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); |
1092 | 1 | let input = Arc::new(StatisticsExec::new( |
1093 | 1 | Statistics::new_unknown(&schema), |
1094 | 1 | schema, |
1095 | 1 | )); |
1096 | 1 | // WHERE a <= 10 AND 0 <= a - 5 |
1097 | 1 | let predicate = Arc::new(BinaryExpr::new( |
1098 | 1 | Arc::new(BinaryExpr::new( |
1099 | 1 | Arc::new(Column::new("a", 0)), |
1100 | 1 | Operator::LtEq, |
1101 | 1 | Arc::new(Literal::new(ScalarValue::Int32(Some(10)))), |
1102 | 1 | )), |
1103 | 1 | Operator::And, |
1104 | 1 | Arc::new(BinaryExpr::new( |
1105 | 1 | Arc::new(Literal::new(ScalarValue::Int32(Some(0)))), |
1106 | 1 | Operator::LtEq, |
1107 | 1 | Arc::new(BinaryExpr::new( |
1108 | 1 | Arc::new(Column::new("a", 0)), |
1109 | 1 | Operator::Minus, |
1110 | 1 | Arc::new(Literal::new(ScalarValue::Int32(Some(5)))), |
1111 | 1 | )), |
1112 | 1 | )), |
1113 | 1 | )); |
1114 | 1 | let filter: Arc<dyn ExecutionPlan> = |
1115 | 1 | Arc::new(FilterExec::try_new(predicate, input)?0 ); |
1116 | 1 | let filter_statistics = filter.statistics()?0 ; |
1117 | 1 | |
1118 | 1 | let expected_filter_statistics = Statistics { |
1119 | 1 | num_rows: Precision::Absent, |
1120 | 1 | total_byte_size: Precision::Absent, |
1121 | 1 | column_statistics: vec![ColumnStatistics { |
1122 | 1 | null_count: Precision::Absent, |
1123 | 1 | min_value: Precision::Inexact(ScalarValue::Int32(Some(5))), |
1124 | 1 | max_value: Precision::Inexact(ScalarValue::Int32(Some(10))), |
1125 | 1 | distinct_count: Precision::Absent, |
1126 | 1 | }], |
1127 | 1 | }; |
1128 | 1 | |
1129 | 1 | assert_eq!(filter_statistics, expected_filter_statistics); |
1130 | 1 | |
1131 | 1 | Ok(()) |
1132 | 1 | } |
1133 | | |
1134 | | #[tokio::test] |
1135 | 1 | async fn test_statistics_with_constant_column() -> Result<()> { |
1136 | 1 | let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); |
1137 | 1 | let input = Arc::new(StatisticsExec::new( |
1138 | 1 | Statistics::new_unknown(&schema), |
1139 | 1 | schema, |
1140 | 1 | )); |
1141 | 1 | // WHERE a = 10 |
1142 | 1 | let predicate = Arc::new(BinaryExpr::new( |
1143 | 1 | Arc::new(Column::new("a", 0)), |
1144 | 1 | Operator::Eq, |
1145 | 1 | Arc::new(Literal::new(ScalarValue::Int32(Some(10)))), |
1146 | 1 | )); |
1147 | 1 | let filter: Arc<dyn ExecutionPlan> = |
1148 | 1 | Arc::new(FilterExec::try_new(predicate, input)?0 ); |
1149 | 1 | let filter_statistics = filter.statistics()?0 ; |
1150 | 1 | // First column is "a", and it is a column with only one value after the filter. |
1151 | 1 | assert!(filter_statistics.column_statistics[0].is_singleton()); |
1152 | 1 | |
1153 | 1 | Ok(()) |
1154 | 1 | } |
1155 | | |
1156 | | #[tokio::test] |
1157 | 1 | async fn test_validation_filter_selectivity() -> Result<()> { |
1158 | 1 | let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); |
1159 | 1 | let input = Arc::new(StatisticsExec::new( |
1160 | 1 | Statistics::new_unknown(&schema), |
1161 | 1 | schema, |
1162 | 1 | )); |
1163 | 1 | // WHERE a = 10 |
1164 | 1 | let predicate = Arc::new(BinaryExpr::new( |
1165 | 1 | Arc::new(Column::new("a", 0)), |
1166 | 1 | Operator::Eq, |
1167 | 1 | Arc::new(Literal::new(ScalarValue::Int32(Some(10)))), |
1168 | 1 | )); |
1169 | 1 | let filter = FilterExec::try_new(predicate, input)?0 ; |
1170 | 1 | assert!(filter.with_default_selectivity(120).is_err()); |
1171 | 1 | Ok(()) |
1172 | 1 | } |
1173 | | |
1174 | | #[tokio::test] |
1175 | 1 | async fn test_custom_filter_selectivity() -> Result<()> { |
1176 | 1 | // Need a decimal to trigger inexact selectivity |
1177 | 1 | let schema = |
1178 | 1 | Schema::new(vec![Field::new("a", DataType::Decimal128(2, 3), false)]); |
1179 | 1 | let input = Arc::new(StatisticsExec::new( |
1180 | 1 | Statistics { |
1181 | 1 | num_rows: Precision::Inexact(1000), |
1182 | 1 | total_byte_size: Precision::Inexact(4000), |
1183 | 1 | column_statistics: vec![ColumnStatistics { |
1184 | 1 | ..Default::default() |
1185 | 1 | }], |
1186 | 1 | }, |
1187 | 1 | schema, |
1188 | 1 | )); |
1189 | 1 | // WHERE a = 10 |
1190 | 1 | let predicate = Arc::new(BinaryExpr::new( |
1191 | 1 | Arc::new(Column::new("a", 0)), |
1192 | 1 | Operator::Eq, |
1193 | 1 | Arc::new(Literal::new(ScalarValue::Decimal128(Some(10), 10, 10))), |
1194 | 1 | )); |
1195 | 1 | let filter = FilterExec::try_new(predicate, input)?0 ; |
1196 | 1 | let statistics = filter.statistics()?0 ; |
1197 | 1 | assert_eq!(statistics.num_rows, Precision::Inexact(200)); |
1198 | 1 | assert_eq!(statistics.total_byte_size, Precision::Inexact(800)); |
1199 | 1 | let filter = filter.with_default_selectivity(40)?0 ; |
1200 | 1 | let statistics = filter.statistics()?0 ; |
1201 | 1 | assert_eq!(statistics.num_rows, Precision::Inexact(400)); |
1202 | 1 | assert_eq!(statistics.total_byte_size, Precision::Inexact(1600)); |
1203 | 1 | Ok(()) |
1204 | 1 | } |
1205 | | |
1206 | | #[test] |
1207 | 1 | fn test_equivalence_properties_union_type() -> Result<()> { |
1208 | 1 | let union_type = DataType::Union( |
1209 | 1 | UnionFields::new( |
1210 | 1 | vec![0, 1], |
1211 | 1 | vec![ |
1212 | 1 | Field::new("f1", DataType::Int32, true), |
1213 | 1 | Field::new("f2", DataType::Utf8, true), |
1214 | 1 | ], |
1215 | 1 | ), |
1216 | 1 | UnionMode::Sparse, |
1217 | 1 | ); |
1218 | 1 | |
1219 | 1 | let schema = Arc::new(Schema::new(vec![ |
1220 | 1 | Field::new("c1", DataType::Int32, true), |
1221 | 1 | Field::new("c2", union_type, true), |
1222 | 1 | ])); |
1223 | | |
1224 | 1 | let exec = FilterExec::try_new( |
1225 | | binary( |
1226 | 1 | binary(col("c1", &schema)?0 , Operator::GtEq, lit(1i32), &schema)?0 , |
1227 | 1 | Operator::And, |
1228 | 1 | binary(col("c1", &schema)?0 , Operator::LtEq, lit(4i32), &schema)?0 , |
1229 | 1 | &schema, |
1230 | 0 | )?, |
1231 | 1 | Arc::new(EmptyExec::new(Arc::clone(&schema))), |
1232 | 0 | )?; |
1233 | | |
1234 | 1 | exec.statistics().unwrap(); |
1235 | 1 | |
1236 | 1 | Ok(()) |
1237 | 1 | } |
1238 | | } |