/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/analysis.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Interval and selectivity in [`AnalysisContext`] |
19 | | |
20 | | use std::fmt::Debug; |
21 | | use std::sync::Arc; |
22 | | |
23 | | use crate::expressions::Column; |
24 | | use crate::intervals::cp_solver::{ExprIntervalGraph, PropagationResult}; |
25 | | use crate::utils::collect_columns; |
26 | | use crate::PhysicalExpr; |
27 | | |
28 | | use arrow::datatypes::Schema; |
29 | | use datafusion_common::stats::Precision; |
30 | | use datafusion_common::{ |
31 | | internal_datafusion_err, internal_err, ColumnStatistics, Result, ScalarValue, |
32 | | }; |
33 | | use datafusion_expr::interval_arithmetic::{cardinality_ratio, Interval}; |
34 | | |
/// The shared context used during the analysis of an expression. Includes
/// the boundaries for all known columns.
#[derive(Clone, Debug, PartialEq)]
pub struct AnalysisContext {
    /// A list of known column boundaries, ordered by the index
    /// of the column in the current schema.
    pub boundaries: Vec<ExprBoundaries>,
    /// The estimated percentage of rows that this expression would select, if
    /// it were to be used as a boolean predicate on a filter. The value will be
    /// between 0.0 (selects nothing) and 1.0 (selects everything).
    pub selectivity: Option<f64>,
}
47 | | |
48 | | impl AnalysisContext { |
49 | 66 | pub fn new(boundaries: Vec<ExprBoundaries>) -> Self { |
50 | 66 | Self { |
51 | 66 | boundaries, |
52 | 66 | selectivity: None, |
53 | 66 | } |
54 | 66 | } |
55 | | |
56 | 33 | pub fn with_selectivity(mut self, selectivity: f64) -> Self { |
57 | 33 | self.selectivity = Some(selectivity); |
58 | 33 | self |
59 | 33 | } |
60 | | |
61 | | /// Create a new analysis context from column statistics. |
62 | 33 | pub fn try_from_statistics( |
63 | 33 | input_schema: &Schema, |
64 | 33 | statistics: &[ColumnStatistics], |
65 | 33 | ) -> Result<Self> { |
66 | 33 | statistics |
67 | 33 | .iter() |
68 | 33 | .enumerate() |
69 | 54 | .map(|(idx, stats)| ExprBoundaries::try_from_column(input_schema, stats, idx)) |
70 | 33 | .collect::<Result<Vec<_>>>() |
71 | 33 | .map(Self::new) |
72 | 33 | } |
73 | | } |
74 | | |
/// Represents the boundaries (e.g. min and max values) of a particular column
///
/// This is used in range analysis of expressions, to determine if the expression
/// limits the value of particular columns (e.g. analyzing an expression such as
/// `time < 50` would result in a boundary interval for `time` having a max
/// value of `50`).
#[derive(Clone, Debug, PartialEq)]
pub struct ExprBoundaries {
    /// The column these boundaries apply to.
    pub column: Column,
    /// Minimum and maximum values this expression can have.
    pub interval: Interval,
    /// Maximum number of distinct values this expression can produce, if known.
    pub distinct_count: Precision<usize>,
}
89 | | |
90 | | impl ExprBoundaries { |
91 | | /// Create a new `ExprBoundaries` object from column level statistics. |
92 | 54 | pub fn try_from_column( |
93 | 54 | schema: &Schema, |
94 | 54 | col_stats: &ColumnStatistics, |
95 | 54 | col_index: usize, |
96 | 54 | ) -> Result<Self> { |
97 | 54 | let field = schema.fields().get(col_index).ok_or_else(|| { |
98 | 0 | internal_datafusion_err!( |
99 | 0 | "Could not create `ExprBoundaries`: in `try_from_column` `col_index` |
100 | 0 | has gone out of bounds with a value of {col_index}, the schema has {} columns.", |
101 | 0 | schema.fields.len() |
102 | 0 | ) |
103 | 54 | })?0 ; |
104 | 54 | let empty_field = |
105 | 54 | ScalarValue::try_from(field.data_type()).unwrap_or(ScalarValue::Null); |
106 | 54 | let interval = Interval::try_new( |
107 | 54 | col_stats |
108 | 54 | .min_value |
109 | 54 | .get_value() |
110 | 54 | .cloned() |
111 | 54 | .unwrap_or(empty_field.clone()), |
112 | 54 | col_stats |
113 | 54 | .max_value |
114 | 54 | .get_value() |
115 | 54 | .cloned() |
116 | 54 | .unwrap_or(empty_field), |
117 | 54 | )?0 ; |
118 | 54 | let column = Column::new(field.name(), col_index); |
119 | 54 | Ok(ExprBoundaries { |
120 | 54 | column, |
121 | 54 | interval, |
122 | 54 | distinct_count: col_stats.distinct_count, |
123 | 54 | }) |
124 | 54 | } |
125 | | |
126 | | /// Create `ExprBoundaries` that represent no known bounds for all the |
127 | | /// columns in `schema` |
128 | 0 | pub fn try_new_unbounded(schema: &Schema) -> Result<Vec<Self>> { |
129 | 0 | schema |
130 | 0 | .fields() |
131 | 0 | .iter() |
132 | 0 | .enumerate() |
133 | 0 | .map(|(i, field)| { |
134 | 0 | Ok(Self { |
135 | 0 | column: Column::new(field.name(), i), |
136 | 0 | interval: Interval::make_unbounded(field.data_type())?, |
137 | 0 | distinct_count: Precision::Absent, |
138 | | }) |
139 | 0 | }) |
140 | 0 | .collect() |
141 | 0 | } |
142 | | } |
143 | | |
144 | | /// Attempts to refine column boundaries and compute a selectivity value. |
145 | | /// |
146 | | /// The function accepts boundaries of the input columns in the `context` parameter. |
147 | | /// It then tries to tighten these boundaries based on the provided `expr`. |
148 | | /// The resulting selectivity value is calculated by comparing the initial and final boundaries. |
149 | | /// The computation assumes that the data within the column is uniformly distributed and not sorted. |
150 | | /// |
151 | | /// # Arguments |
152 | | /// |
153 | | /// * `context` - The context holding input column boundaries. |
154 | | /// * `expr` - The expression used to shrink the column boundaries. |
155 | | /// |
156 | | /// # Returns |
157 | | /// |
158 | | /// * `AnalysisContext` constructed by pruned boundaries and a selectivity value. |
159 | 33 | pub fn analyze( |
160 | 33 | expr: &Arc<dyn PhysicalExpr>, |
161 | 33 | context: AnalysisContext, |
162 | 33 | schema: &Schema, |
163 | 33 | ) -> Result<AnalysisContext> { |
164 | 33 | let target_boundaries = context.boundaries; |
165 | | |
166 | 33 | let mut graph = ExprIntervalGraph::try_new(Arc::clone(expr), schema)?0 ; |
167 | | |
168 | 33 | let columns = collect_columns(expr) |
169 | 33 | .into_iter() |
170 | 41 | .map(|c| Arc::new(c) as _) |
171 | 33 | .collect::<Vec<_>>(); |
172 | 33 | |
173 | 33 | let target_expr_and_indices = graph.gather_node_indices(columns.as_slice()); |
174 | 33 | |
175 | 33 | let mut target_indices_and_boundaries = target_expr_and_indices |
176 | 33 | .iter() |
177 | 41 | .filter_map(|(expr, i)| { |
178 | 54 | target_boundaries.iter().find_map(|bound| { |
179 | 54 | expr.as_any() |
180 | 54 | .downcast_ref::<Column>() |
181 | 54 | .filter(|expr_column| bound.column.eq(*expr_column)) |
182 | 54 | .map(|_| (*i, bound.interval.clone())41 ) |
183 | 54 | }) |
184 | 41 | }) |
185 | 33 | .collect::<Vec<_>>(); |
186 | 33 | |
187 | 33 | match graph |
188 | 33 | .update_ranges(&mut target_indices_and_boundaries, Interval::CERTAINLY_TRUE)?0 |
189 | | { |
190 | | PropagationResult::Success => { |
191 | 29 | shrink_boundaries(graph, target_boundaries, target_expr_and_indices) |
192 | | } |
193 | | PropagationResult::Infeasible => { |
194 | 2 | Ok(AnalysisContext::new(target_boundaries).with_selectivity(0.0)) |
195 | | } |
196 | | PropagationResult::CannotPropagate => { |
197 | 2 | Ok(AnalysisContext::new(target_boundaries).with_selectivity(1.0)) |
198 | | } |
199 | | } |
200 | 33 | } |
201 | | |
202 | | /// If the `PropagationResult` indicates success, this function calculates the |
203 | | /// selectivity value by comparing the initial and final column boundaries. |
204 | | /// Following this, it constructs and returns a new `AnalysisContext` with the |
205 | | /// updated parameters. |
206 | 29 | fn shrink_boundaries( |
207 | 29 | graph: ExprIntervalGraph, |
208 | 29 | mut target_boundaries: Vec<ExprBoundaries>, |
209 | 29 | target_expr_and_indices: Vec<(Arc<dyn PhysicalExpr>, usize)>, |
210 | 29 | ) -> Result<AnalysisContext> { |
211 | 29 | let initial_boundaries = target_boundaries.clone(); |
212 | 33 | target_expr_and_indices.iter().for_each(|(expr, i)| { |
213 | 33 | if let Some(column) = expr.as_any().downcast_ref::<Column>() { |
214 | 33 | if let Some(bound) = target_boundaries |
215 | 33 | .iter_mut() |
216 | 42 | .find(|bound| bound.column.eq(column))33 |
217 | 33 | { |
218 | 33 | bound.interval = graph.get_interval(*i); |
219 | 33 | };0 |
220 | 0 | } |
221 | 33 | }); |
222 | 29 | |
223 | 29 | let selectivity = calculate_selectivity(&target_boundaries, &initial_boundaries); |
224 | 29 | |
225 | 29 | if !(0.0..=1.0).contains(&selectivity) { |
226 | 0 | return internal_err!("Selectivity is out of limit: {}", selectivity); |
227 | 29 | } |
228 | 29 | |
229 | 29 | Ok(AnalysisContext::new(target_boundaries).with_selectivity(selectivity)) |
230 | 29 | } |
231 | | |
232 | | /// This function calculates the filter predicate's selectivity by comparing |
233 | | /// the initial and pruned column boundaries. Selectivity is defined as the |
234 | | /// ratio of rows in a table that satisfy the filter's predicate. |
235 | 29 | fn calculate_selectivity( |
236 | 29 | target_boundaries: &[ExprBoundaries], |
237 | 29 | initial_boundaries: &[ExprBoundaries], |
238 | 29 | ) -> f64 { |
239 | 29 | // Since the intervals are assumed uniform and the values |
240 | 29 | // are not correlated, we need to multiply the selectivities |
241 | 29 | // of multiple columns to get the overall selectivity. |
242 | 29 | initial_boundaries |
243 | 29 | .iter() |
244 | 29 | .zip(target_boundaries.iter()) |
245 | 46 | .fold(1.0, |acc, (initial, target)| { |
246 | 46 | acc * cardinality_ratio(&initial.interval, &target.interval) |
247 | 46 | }) |
248 | 29 | } |