Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/window/built_in.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Physical exec for built-in window function expressions.
19
20
use std::any::Any;
21
use std::ops::Range;
22
use std::sync::Arc;
23
24
use super::{BuiltInWindowFunctionExpr, WindowExpr};
25
use crate::expressions::PhysicalSortExpr;
26
use crate::window::window_expr::{get_orderby_values, WindowFn};
27
use crate::window::{PartitionBatches, PartitionWindowAggStates, WindowState};
28
use crate::{reverse_order_bys, EquivalenceProperties, PhysicalExpr};
29
use arrow::array::{new_empty_array, ArrayRef};
30
use arrow::compute::SortOptions;
31
use arrow::datatypes::Field;
32
use arrow::record_batch::RecordBatch;
33
use datafusion_common::utils::evaluate_partition_ranges;
34
use datafusion_common::{Result, ScalarValue};
35
use datafusion_expr::window_state::{WindowAggState, WindowFrameContext};
36
use datafusion_expr::WindowFrame;
37
38
/// A window expr that takes the form of a [`BuiltInWindowFunctionExpr`].
39
#[derive(Debug)]
40
pub struct BuiltInWindowExpr {
41
    expr: Arc<dyn BuiltInWindowFunctionExpr>,
42
    partition_by: Vec<Arc<dyn PhysicalExpr>>,
43
    order_by: Vec<PhysicalSortExpr>,
44
    window_frame: Arc<WindowFrame>,
45
}
46
47
impl BuiltInWindowExpr {
48
    /// create a new built-in window function expression
49
3
    pub fn new(
50
3
        expr: Arc<dyn BuiltInWindowFunctionExpr>,
51
3
        partition_by: &[Arc<dyn PhysicalExpr>],
52
3
        order_by: &[PhysicalSortExpr],
53
3
        window_frame: Arc<WindowFrame>,
54
3
    ) -> Self {
55
3
        Self {
56
3
            expr,
57
3
            partition_by: partition_by.to_vec(),
58
3
            order_by: order_by.to_vec(),
59
3
            window_frame,
60
3
        }
61
3
    }
62
63
    /// Get BuiltInWindowFunction expr of BuiltInWindowExpr
64
0
    pub fn get_built_in_func_expr(&self) -> &Arc<dyn BuiltInWindowFunctionExpr> {
65
0
        &self.expr
66
0
    }
67
68
    /// Adds any equivalent orderings generated by the `self.expr`
69
    /// to `eq_properties`.
70
    ///
71
    /// If `self.expr` doesn't have an ordering, ordering equivalence properties
72
    /// are not updated. Otherwise, ordering equivalence properties are updated
73
    /// by the ordering of `self.expr`.
74
3
    pub fn add_equal_orderings(&self, eq_properties: &mut EquivalenceProperties) {
75
3
        let schema = eq_properties.schema();
76
3
        if let Some(
fn_res_ordering0
) = self.expr.get_result_ordering(schema) {
77
0
            if self.partition_by.is_empty() {
78
0
                // In the absence of a PARTITION BY, ordering of `self.expr` is global:
79
0
                eq_properties.add_new_orderings([vec![fn_res_ordering]]);
80
0
            } else {
81
                // If we have a PARTITION BY, built-in functions can not introduce
82
                // a global ordering unless the existing ordering is compatible
83
                // with PARTITION BY expressions. To elaborate, when PARTITION BY
84
                // expressions and existing ordering expressions are equal (w.r.t.
85
                // set equality), we can prefix the ordering of `self.expr` with
86
                // the existing ordering.
87
0
                let (mut ordering, _) =
88
0
                    eq_properties.find_longest_permutation(&self.partition_by);
89
0
                if ordering.len() == self.partition_by.len() {
90
0
                    ordering.push(fn_res_ordering);
91
0
                    eq_properties.add_new_orderings([ordering]);
92
0
                }
93
            }
94
3
        }
95
3
    }
96
}
97
98
impl WindowExpr for BuiltInWindowExpr {
99
    /// Return a reference to Any that can be used for downcasting
100
3
    fn as_any(&self) -> &dyn Any {
101
3
        self
102
3
    }
103
104
3
    fn name(&self) -> &str {
105
3
        self.expr.name()
106
3
    }
107
108
6
    fn field(&self) -> Result<Field> {
109
6
        self.expr.field()
110
6
    }
111
112
12
    fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> {
113
12
        self.expr.expressions()
114
12
    }
115
116
4
    fn partition_by(&self) -> &[Arc<dyn PhysicalExpr>] {
117
4
        &self.partition_by
118
4
    }
119
120
12
    fn order_by(&self) -> &[PhysicalSortExpr] {
121
12
        &self.order_by
122
12
    }
123
124
0
    fn evaluate(&self, batch: &RecordBatch) -> Result<ArrayRef> {
125
0
        let mut evaluator = self.expr.create_evaluator()?;
126
0
        let num_rows = batch.num_rows();
127
0
        if evaluator.uses_window_frame() {
128
0
            let sort_options: Vec<SortOptions> =
129
0
                self.order_by.iter().map(|o| o.options).collect();
130
0
            let mut row_wise_results = vec![];
131
132
0
            let mut values = self.evaluate_args(batch)?;
133
0
            let order_bys = get_orderby_values(self.order_by_columns(batch)?);
134
0
            let n_args = values.len();
135
0
            values.extend(order_bys);
136
0
            let order_bys_ref = &values[n_args..];
137
0
138
0
            let mut window_frame_ctx =
139
0
                WindowFrameContext::new(Arc::clone(&self.window_frame), sort_options);
140
0
            let mut last_range = Range { start: 0, end: 0 };
141
            // We iterate on each row to calculate window frame range and window function result
142
0
            for idx in 0..num_rows {
143
0
                let range = window_frame_ctx.calculate_range(
144
0
                    order_bys_ref,
145
0
                    &last_range,
146
0
                    num_rows,
147
0
                    idx,
148
0
                )?;
149
0
                let value = evaluator.evaluate(&values, &range)?;
150
0
                row_wise_results.push(value);
151
0
                last_range = range;
152
            }
153
0
            ScalarValue::iter_to_array(row_wise_results)
154
0
        } else if evaluator.include_rank() {
155
0
            let columns = self.order_by_columns(batch)?;
156
0
            let sort_partition_points = evaluate_partition_ranges(num_rows, &columns)?;
157
0
            evaluator.evaluate_all_with_rank(num_rows, &sort_partition_points)
158
        } else {
159
0
            let values = self.evaluate_args(batch)?;
160
0
            evaluator.evaluate_all(&values, num_rows)
161
        }
162
0
    }
163
164
    /// Evaluate the window function against the batch. This function facilitates
165
    /// stateful, bounded-memory implementations.
166
12
    fn evaluate_stateful(
167
12
        &self,
168
12
        partition_batches: &PartitionBatches,
169
12
        window_agg_state: &mut PartitionWindowAggStates,
170
12
    ) -> Result<()> {
171
12
        let field = self.expr.field()
?0
;
172
12
        let out_type = field.data_type();
173
12
        let sort_options = self.order_by.iter().map(|o| 
o.options0
).collect::<Vec<_>>();
174
12
        for (partition_row, partition_batch_state) in partition_batches.iter() {
175
12
            let window_state =
176
12
                if let Some(
window_state9
) = window_agg_state.get_mut(partition_row) {
177
9
                    window_state
178
                } else {
179
3
                    let evaluator = self.expr.create_evaluator()
?0
;
180
3
                    window_agg_state
181
3
                        .entry(partition_row.clone())
182
3
                        .or_insert(WindowState {
183
3
                            state: WindowAggState::new(out_type)
?0
,
184
3
                            window_fn: WindowFn::Builtin(evaluator),
185
                        })
186
                };
187
12
            let evaluator = match &mut window_state.window_fn {
188
12
                WindowFn::Builtin(evaluator) => evaluator,
189
0
                _ => unreachable!(),
190
            };
191
12
            let state = &mut window_state.state;
192
12
193
12
            let batch_ref = &partition_batch_state.record_batch;
194
12
            let mut values = self.evaluate_args(batch_ref)
?0
;
195
12
            let order_bys = if evaluator.uses_window_frame() || 
evaluator.include_rank()0
{
196
12
                get_orderby_values(self.order_by_columns(batch_ref)
?0
)
197
            } else {
198
0
                vec![]
199
            };
200
12
            let n_args = values.len();
201
12
            values.extend(order_bys);
202
12
            let order_bys_ref = &values[n_args..];
203
12
204
12
            // We iterate on each row to perform a running calculation.
205
12
            let record_batch = &partition_batch_state.record_batch;
206
12
            let num_rows = record_batch.num_rows();
207
12
            let mut row_wise_results: Vec<ScalarValue> = vec![];
208
12
            let is_causal = if evaluator.uses_window_frame() {
209
12
                self.window_frame.is_causal()
210
            } else {
211
0
                evaluator.is_causal()
212
            };
213
27
            for idx in 
state.last_calculated_index..num_rows12
{
214
27
                let frame_range = if evaluator.uses_window_frame() {
215
27
                    state
216
27
                        .window_frame_ctx
217
27
                        .get_or_insert_with(|| {
218
3
                            WindowFrameContext::new(
219
3
                                Arc::clone(&self.window_frame),
220
3
                                sort_options.clone(),
221
3
                            )
222
27
                        })
223
27
                        .calculate_range(
224
27
                            order_bys_ref,
225
27
                            // Start search from the last range
226
27
                            &state.window_frame_range,
227
27
                            num_rows,
228
27
                            idx,
229
27
                        )
230
                } else {
231
0
                    evaluator.get_range(idx, num_rows)
232
0
                }?;
233
234
                // Exit if the range is non-causal and extends all the way:
235
27
                if frame_range.end == num_rows
236
9
                    && !is_causal
237
0
                    && !partition_batch_state.is_end
238
                {
239
0
                    break;
240
27
                }
241
27
                // Update last range
242
27
                state.window_frame_range = frame_range;
243
27
                row_wise_results
244
27
                    .push(evaluator.evaluate(&values, &state.window_frame_range)
?0
);
245
            }
246
12
            let out_col = if row_wise_results.is_empty() {
247
3
                new_empty_array(out_type)
248
            } else {
249
9
                ScalarValue::iter_to_array(row_wise_results.into_iter())
?0
250
            };
251
252
12
            state.update(&out_col, partition_batch_state)
?0
;
253
12
            if self.window_frame.start_bound.is_unbounded() {
254
12
                evaluator.memoize(state)
?0
;
255
0
            }
256
        }
257
12
        Ok(())
258
12
    }
259
260
3
    fn get_window_frame(&self) -> &Arc<WindowFrame> {
261
3
        &self.window_frame
262
3
    }
263
264
0
    fn get_reverse_expr(&self) -> Option<Arc<dyn WindowExpr>> {
265
0
        self.expr.reverse_expr().map(|reverse_expr| {
266
0
            Arc::new(BuiltInWindowExpr::new(
267
0
                reverse_expr,
268
0
                &self.partition_by.clone(),
269
0
                &reverse_order_bys(&self.order_by),
270
0
                Arc::new(self.window_frame.reverse()),
271
0
            )) as _
272
0
        })
273
0
    }
274
275
0
    fn uses_bounded_memory(&self) -> bool {
276
0
        if let Ok(evaluator) = self.expr.create_evaluator() {
277
0
            evaluator.supports_bounded_execution()
278
0
                && (!evaluator.uses_window_frame()
279
0
                    || !self.window_frame.end_bound.is_unbounded())
280
        } else {
281
0
            false
282
        }
283
0
    }
284
}