Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/analysis.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Interval and selectivity in [`AnalysisContext`]
19
20
use std::fmt::Debug;
21
use std::sync::Arc;
22
23
use crate::expressions::Column;
24
use crate::intervals::cp_solver::{ExprIntervalGraph, PropagationResult};
25
use crate::utils::collect_columns;
26
use crate::PhysicalExpr;
27
28
use arrow::datatypes::Schema;
29
use datafusion_common::stats::Precision;
30
use datafusion_common::{
31
    internal_datafusion_err, internal_err, ColumnStatistics, Result, ScalarValue,
32
};
33
use datafusion_expr::interval_arithmetic::{cardinality_ratio, Interval};
34
35
/// The shared context used during the analysis of an expression. Includes
36
/// the boundaries for all known columns.
37
#[derive(Clone, Debug, PartialEq)]
38
pub struct AnalysisContext {
39
    // A list of known column boundaries, ordered by the index
40
    // of the column in the current schema.
41
    pub boundaries: Vec<ExprBoundaries>,
42
    /// The estimated percentage of rows that this expression would select, if
43
    /// it were to be used as a boolean predicate on a filter. The value will be
44
    /// between 0.0 (selects nothing) and 1.0 (selects everything).
45
    pub selectivity: Option<f64>,
46
}
47
48
impl AnalysisContext {
49
66
    pub fn new(boundaries: Vec<ExprBoundaries>) -> Self {
50
66
        Self {
51
66
            boundaries,
52
66
            selectivity: None,
53
66
        }
54
66
    }
55
56
33
    pub fn with_selectivity(mut self, selectivity: f64) -> Self {
57
33
        self.selectivity = Some(selectivity);
58
33
        self
59
33
    }
60
61
    /// Create a new analysis context from column statistics.
62
33
    pub fn try_from_statistics(
63
33
        input_schema: &Schema,
64
33
        statistics: &[ColumnStatistics],
65
33
    ) -> Result<Self> {
66
33
        statistics
67
33
            .iter()
68
33
            .enumerate()
69
54
            .map(|(idx, stats)| ExprBoundaries::try_from_column(input_schema, stats, idx))
70
33
            .collect::<Result<Vec<_>>>()
71
33
            .map(Self::new)
72
33
    }
73
}
74
75
/// Represents the boundaries (e.g. min and max values) of a particular column
76
///
77
/// This is used for range analysis of expressions, to determine if the expression
78
/// limits the value of particular columns (e.g. analyzing an expression such as
79
/// `time < 50` would result in a boundary interval for `time` having a max
80
/// value of `50`).
81
#[derive(Clone, Debug, PartialEq)]
82
pub struct ExprBoundaries {
83
    pub column: Column,
84
    /// Minimum and maximum values this expression can have.
85
    pub interval: Interval,
86
    /// Maximum number of distinct values this expression can produce, if known.
87
    pub distinct_count: Precision<usize>,
88
}
89
90
impl ExprBoundaries {
91
    /// Create a new `ExprBoundaries` object from column level statistics.
92
54
    pub fn try_from_column(
93
54
        schema: &Schema,
94
54
        col_stats: &ColumnStatistics,
95
54
        col_index: usize,
96
54
    ) -> Result<Self> {
97
54
        let field = schema.fields().get(col_index).ok_or_else(|| {
98
0
            internal_datafusion_err!(
99
0
                "Could not create `ExprBoundaries`: in `try_from_column` `col_index` 
100
0
                has gone out of bounds with a value of {col_index}, the schema has {} columns.",
101
0
                schema.fields.len()
102
0
            )
103
54
        })
?0
;
104
54
        let empty_field =
105
54
            ScalarValue::try_from(field.data_type()).unwrap_or(ScalarValue::Null);
106
54
        let interval = Interval::try_new(
107
54
            col_stats
108
54
                .min_value
109
54
                .get_value()
110
54
                .cloned()
111
54
                .unwrap_or(empty_field.clone()),
112
54
            col_stats
113
54
                .max_value
114
54
                .get_value()
115
54
                .cloned()
116
54
                .unwrap_or(empty_field),
117
54
        )
?0
;
118
54
        let column = Column::new(field.name(), col_index);
119
54
        Ok(ExprBoundaries {
120
54
            column,
121
54
            interval,
122
54
            distinct_count: col_stats.distinct_count,
123
54
        })
124
54
    }
125
126
    /// Create `ExprBoundaries` that represent no known bounds for all the
127
    /// columns in `schema`
128
0
    pub fn try_new_unbounded(schema: &Schema) -> Result<Vec<Self>> {
129
0
        schema
130
0
            .fields()
131
0
            .iter()
132
0
            .enumerate()
133
0
            .map(|(i, field)| {
134
0
                Ok(Self {
135
0
                    column: Column::new(field.name(), i),
136
0
                    interval: Interval::make_unbounded(field.data_type())?,
137
0
                    distinct_count: Precision::Absent,
138
                })
139
0
            })
140
0
            .collect()
141
0
    }
142
}
143
144
/// Attempts to refine column boundaries and compute a selectivity value.
145
///
146
/// The function accepts boundaries of the input columns in the `context` parameter.
147
/// It then tries to tighten these boundaries based on the provided `expr`.
148
/// The resulting selectivity value is calculated by comparing the initial and final boundaries.
149
/// The computation assumes that the data within the column is uniformly distributed and not sorted.
150
///
151
/// # Arguments
152
///
153
/// * `context` - The context holding input column boundaries.
154
/// * `expr` - The expression used to shrink the column boundaries.
155
///
156
/// # Returns
157
///
158
/// * `AnalysisContext` constructed from pruned boundaries and a selectivity value.
159
33
pub fn analyze(
160
33
    expr: &Arc<dyn PhysicalExpr>,
161
33
    context: AnalysisContext,
162
33
    schema: &Schema,
163
33
) -> Result<AnalysisContext> {
164
33
    let target_boundaries = context.boundaries;
165
166
33
    let mut graph = ExprIntervalGraph::try_new(Arc::clone(expr), schema)
?0
;
167
168
33
    let columns = collect_columns(expr)
169
33
        .into_iter()
170
41
        .map(|c| Arc::new(c) as _)
171
33
        .collect::<Vec<_>>();
172
33
173
33
    let target_expr_and_indices = graph.gather_node_indices(columns.as_slice());
174
33
175
33
    let mut target_indices_and_boundaries = target_expr_and_indices
176
33
        .iter()
177
41
        .filter_map(|(expr, i)| {
178
54
            target_boundaries.iter().find_map(|bound| {
179
54
                expr.as_any()
180
54
                    .downcast_ref::<Column>()
181
54
                    .filter(|expr_column| bound.column.eq(*expr_column))
182
54
                    .map(|_| 
(*i, bound.interval.clone())41
)
183
54
            })
184
41
        })
185
33
        .collect::<Vec<_>>();
186
33
187
33
    match graph
188
33
        .update_ranges(&mut target_indices_and_boundaries, Interval::CERTAINLY_TRUE)
?0
189
    {
190
        PropagationResult::Success => {
191
29
            shrink_boundaries(graph, target_boundaries, target_expr_and_indices)
192
        }
193
        PropagationResult::Infeasible => {
194
2
            Ok(AnalysisContext::new(target_boundaries).with_selectivity(0.0))
195
        }
196
        PropagationResult::CannotPropagate => {
197
2
            Ok(AnalysisContext::new(target_boundaries).with_selectivity(1.0))
198
        }
199
    }
200
33
}
201
202
/// If the `PropagationResult` indicates success, this function calculates the
203
/// selectivity value by comparing the initial and final column boundaries.
204
/// Following this, it constructs and returns a new `AnalysisContext` with the
205
/// updated parameters.
206
29
fn shrink_boundaries(
207
29
    graph: ExprIntervalGraph,
208
29
    mut target_boundaries: Vec<ExprBoundaries>,
209
29
    target_expr_and_indices: Vec<(Arc<dyn PhysicalExpr>, usize)>,
210
29
) -> Result<AnalysisContext> {
211
29
    let initial_boundaries = target_boundaries.clone();
212
33
    target_expr_and_indices.iter().for_each(|(expr, i)| {
213
33
        if let Some(column) = expr.as_any().downcast_ref::<Column>() {
214
33
            if let Some(bound) = target_boundaries
215
33
                .iter_mut()
216
42
                .find(|bound| bound.column.eq(column)
)33
217
33
            {
218
33
                bound.interval = graph.get_interval(*i);
219
33
            }
;0
220
0
        }
221
33
    });
222
29
223
29
    let selectivity = calculate_selectivity(&target_boundaries, &initial_boundaries);
224
29
225
29
    if !(0.0..=1.0).contains(&selectivity) {
226
0
        return internal_err!("Selectivity is out of limit: {}", selectivity);
227
29
    }
228
29
229
29
    Ok(AnalysisContext::new(target_boundaries).with_selectivity(selectivity))
230
29
}
231
232
/// This function calculates the filter predicate's selectivity by comparing
233
/// the initial and pruned column boundaries. Selectivity is defined as the
234
/// ratio of rows in a table that satisfy the filter's predicate.
235
29
fn calculate_selectivity(
236
29
    target_boundaries: &[ExprBoundaries],
237
29
    initial_boundaries: &[ExprBoundaries],
238
29
) -> f64 {
239
29
    // Since the intervals are assumed uniform and the values
240
29
    // are not correlated, we need to multiply the selectivities
241
29
    // of multiple columns to get the overall selectivity.
242
29
    initial_boundaries
243
29
        .iter()
244
29
        .zip(target_boundaries.iter())
245
46
        .fold(1.0, |acc, (initial, target)| {
246
46
            acc * cardinality_ratio(&initial.interval, &target.interval)
247
46
        })
248
29
}