Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/expr/src/partition_evaluator.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Partition evaluation module
19
20
use arrow::array::ArrayRef;
21
use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue};
22
use std::fmt::Debug;
23
use std::ops::Range;
24
25
use crate::window_state::WindowAggState;
26
27
/// Partition evaluator for Window Functions
28
///
29
/// # Background
30
///
31
/// An implementation of this trait is created and used for each
32
/// partition defined by an `OVER` clause and is instantiated by
33
/// the DataFusion runtime.
34
///
35
/// For example, evaluating `window_func(val) OVER (PARTITION BY col)`
36
/// on the following data:
37
///
38
/// ```text
39
/// col | val
40
/// --- + ----
41
///  A  | 10
42
///  A  | 10
43
///  C  | 20
44
///  D  | 30
45
///  D  | 30
46
/// ```
47
///
48
/// Will instantiate three `PartitionEvaluator`s, one each for the
49
/// partitions defined by `col=A`, `col=C`, and `col=D`.
50
///
51
/// ```text
52
/// col | val
53
/// --- + ----
54
///  A  | 10     <--- partition 1
55
///  A  | 10
56
///
57
/// col | val
58
/// --- + ----
59
///  C  | 20     <--- partition 2
60
///
61
/// col | val
62
/// --- + ----
63
///  D  | 30     <--- partition 3
64
///  D  | 30
65
/// ```
66
///
67
/// Different methods on this trait will be called depending on the
68
/// capabilities described by [`supports_bounded_execution`],
69
/// [`uses_window_frame`], and [`include_rank`].
70
///
71
/// When implementing a new `PartitionEvaluator`, implement
72
/// the corresponding method according to the table below.
73
///
74
/// # Implementation Table
75
///
76
/// |[`uses_window_frame`]|[`supports_bounded_execution`]|[`include_rank`]|function_to_implement|
77
/// |---|---|----|----|
78
/// |false (default)      |false (default)               |false (default)   | [`evaluate_all`]           |
79
/// |false                |true                          |false             | [`evaluate`]               |
80
/// |false                |true/false                    |true              | [`evaluate_all_with_rank`] |
81
/// |true                 |true/false                    |true/false        | [`evaluate`]               |
82
///
83
/// [`evaluate`]: Self::evaluate
84
/// [`evaluate_all`]: Self::evaluate_all
85
/// [`evaluate_all_with_rank`]: Self::evaluate_all_with_rank
86
/// [`uses_window_frame`]: Self::uses_window_frame
87
/// [`include_rank`]: Self::include_rank
88
/// [`supports_bounded_execution`]: Self::supports_bounded_execution
89
pub trait PartitionEvaluator: Debug + Send {
90
    /// When the window frame has a fixed beginning (e.g UNBOUNDED
91
    /// PRECEDING), some functions such as FIRST_VALUE, LAST_VALUE and
92
    /// NTH_VALUE do not need the (unbounded) input once they have
93
    /// seen a certain amount of input.
94
    ///
95
    /// `memoize` is called after each input batch is processed, and
96
    /// such functions can save whatever they need and modify
97
    /// [`WindowAggState`] appropriately to allow rows to be pruned
98
0
    fn memoize(&mut self, _state: &mut WindowAggState) -> Result<()> {
99
0
        Ok(())
100
0
    }
101
102
    /// If `uses_window_frame` flag is `false`. This method is used to
103
    /// calculate required range for the window function during
104
    /// stateful execution.
105
    ///
106
    /// Generally there is no required range, hence by default this
107
    /// returns smallest range(current row). e.g seeing current row is
108
    /// enough to calculate window result (such as row_number, rank,
109
    /// etc)
110
0
    fn get_range(&self, idx: usize, _n_rows: usize) -> Result<Range<usize>> {
111
0
        if self.uses_window_frame() {
112
0
            exec_err!("Range should be calculated from window frame")
113
        } else {
114
0
            Ok(Range {
115
0
                start: idx,
116
0
                end: idx + 1,
117
0
            })
118
        }
119
0
    }
120
121
    /// Get whether evaluator needs future data for its result (if so returns `false`) or not
122
0
    fn is_causal(&self) -> bool {
123
0
        false
124
0
    }
125
126
    /// Evaluate a window function on an entire input partition.
127
    ///
128
    /// This function is called once per input *partition* for window
129
    /// functions that *do not use* values from the window frame,
130
    /// such as `ROW_NUMBER`, `RANK`, `DENSE_RANK`, `PERCENT_RANK`,
131
    /// `CUME_DIST`, `LEAD`, `LAG`).
132
    ///
133
    /// It produces the result of all rows in a single pass. It
134
    /// expects to receive the entire partition as the `value` and
135
    /// must produce an output column with one output row for every
136
    /// input row.
137
    ///
138
    /// `num_rows` is required to correctly compute the output in case
139
    /// `values.len() == 0`
140
    ///
141
    /// Implementing this function is an optimization: certain window
142
    /// functions are not affected by the window frame definition or
143
    /// the query doesn't have a frame, and `evaluate` skips the
144
    /// (costly) window frame boundary calculation and the overhead of
145
    /// calling `evaluate` for each output row.
146
    ///
147
    /// For example, the `LAG` built in window function does not use
148
    /// the values of its window frame (it can be computed in one shot
149
    /// on the entire partition with `Self::evaluate_all` regardless of the
150
    /// window defined in the `OVER` clause)
151
    ///
152
    /// ```sql
153
    /// lag(x, 1) OVER (ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING)
154
    /// ```
155
    ///
156
    /// However, `avg()` computes the average in the window and thus
157
    /// does use its window frame
158
    ///
159
    /// ```sql
160
    /// avg(x) OVER (PARTITION BY y ORDER BY z ROWS BETWEEN 2 PRECEDING AND 3 FOLLOWING)
161
    /// ```
162
0
    fn evaluate_all(&mut self, values: &[ArrayRef], num_rows: usize) -> Result<ArrayRef> {
163
0
        // When window frame boundaries are not used and evaluator supports bounded execution
164
0
        // We can calculate evaluate result by repeatedly calling `self.evaluate` `num_rows` times
165
0
        // If user wants to implement more efficient version, this method should be overwritten
166
0
        // Default implementation may behave suboptimally (For instance `NumRowEvaluator` overwrites it)
167
0
        if !self.uses_window_frame() && self.supports_bounded_execution() {
168
0
            let res = (0..num_rows)
169
0
                .map(|idx| self.evaluate(values, &self.get_range(idx, num_rows)?))
170
0
                .collect::<Result<Vec<_>>>()?;
171
0
            ScalarValue::iter_to_array(res)
172
        } else {
173
0
            not_impl_err!("evaluate_all is not implemented by default")
174
        }
175
0
    }
176
177
    /// Evaluate window function on a range of rows in an input
178
    /// partition.x
179
    ///
180
    /// This is the simplest and most general function to implement
181
    /// but also the least performant as it creates output one row at
182
    /// a time. It is typically much faster to implement stateful
183
    /// evaluation using one of the other specialized methods on this
184
    /// trait.
185
    ///
186
    /// Returns a [`ScalarValue`] that is the value of the window
187
    /// function within `range` for the entire partition. Argument
188
    /// `values` contains the evaluation result of function arguments
189
    /// and evaluation results of ORDER BY expressions. If function has a
190
    /// single argument, `values[1..]` will contain ORDER BY expression results.
191
0
    fn evaluate(
192
0
        &mut self,
193
0
        _values: &[ArrayRef],
194
0
        _range: &Range<usize>,
195
0
    ) -> Result<ScalarValue> {
196
0
        not_impl_err!("evaluate is not implemented by default")
197
0
    }
198
199
    /// [`PartitionEvaluator::evaluate_all_with_rank`] is called for window
200
    /// functions that only need the rank of a row within its window
201
    /// frame.
202
    ///
203
    /// Evaluate the partition evaluator against the partition using
204
    /// the row ranks. For example, `RANK(col)` produces
205
    ///
206
    /// ```text
207
    /// col | rank
208
    /// --- + ----
209
    ///  A  | 1
210
    ///  A  | 1
211
    ///  C  | 3
212
    ///  D  | 4
213
    ///  D  | 5
214
    /// ```
215
    ///
216
    /// For this case, `num_rows` would be `5` and the
217
    /// `ranks_in_partition` would be called with
218
    ///
219
    /// ```text
220
    /// [
221
    ///   (0,1),
222
    ///   (2,2),
223
    ///   (3,4),
224
    /// ]
225
    /// ```
226
0
    fn evaluate_all_with_rank(
227
0
        &self,
228
0
        _num_rows: usize,
229
0
        _ranks_in_partition: &[Range<usize>],
230
0
    ) -> Result<ArrayRef> {
231
0
        not_impl_err!("evaluate_partition_with_rank is not implemented by default")
232
0
    }
233
234
    /// Can the window function be incrementally computed using
235
    /// bounded memory?
236
    ///
237
    /// See the table on [`Self`] for what functions to implement
238
0
    fn supports_bounded_execution(&self) -> bool {
239
0
        false
240
0
    }
241
242
    /// Does the window function use the values from the window frame,
243
    /// if one is specified?
244
    ///
245
    /// See the table on [`Self`] for what functions to implement
246
0
    fn uses_window_frame(&self) -> bool {
247
0
        false
248
0
    }
249
250
    /// Can this function be evaluated with (only) rank
251
    ///
252
    /// See the table on [`Self`] for what functions to implement
253
0
    fn include_rank(&self) -> bool {
254
0
        false
255
0
    }
256
}