/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/window/built_in.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Physical exec for built-in window function expressions. |
19 | | |
20 | | use std::any::Any; |
21 | | use std::ops::Range; |
22 | | use std::sync::Arc; |
23 | | |
24 | | use super::{BuiltInWindowFunctionExpr, WindowExpr}; |
25 | | use crate::expressions::PhysicalSortExpr; |
26 | | use crate::window::window_expr::{get_orderby_values, WindowFn}; |
27 | | use crate::window::{PartitionBatches, PartitionWindowAggStates, WindowState}; |
28 | | use crate::{reverse_order_bys, EquivalenceProperties, PhysicalExpr}; |
29 | | use arrow::array::{new_empty_array, ArrayRef}; |
30 | | use arrow::compute::SortOptions; |
31 | | use arrow::datatypes::Field; |
32 | | use arrow::record_batch::RecordBatch; |
33 | | use datafusion_common::utils::evaluate_partition_ranges; |
34 | | use datafusion_common::{Result, ScalarValue}; |
35 | | use datafusion_expr::window_state::{WindowAggState, WindowFrameContext}; |
36 | | use datafusion_expr::WindowFrame; |
37 | | |
38 | | /// A window expr that takes the form of a [`BuiltInWindowFunctionExpr`]. |
39 | | #[derive(Debug)] |
40 | | pub struct BuiltInWindowExpr { |
41 | | expr: Arc<dyn BuiltInWindowFunctionExpr>, |
42 | | partition_by: Vec<Arc<dyn PhysicalExpr>>, |
43 | | order_by: Vec<PhysicalSortExpr>, |
44 | | window_frame: Arc<WindowFrame>, |
45 | | } |
46 | | |
47 | | impl BuiltInWindowExpr { |
48 | | /// create a new built-in window function expression |
49 | 3 | pub fn new( |
50 | 3 | expr: Arc<dyn BuiltInWindowFunctionExpr>, |
51 | 3 | partition_by: &[Arc<dyn PhysicalExpr>], |
52 | 3 | order_by: &[PhysicalSortExpr], |
53 | 3 | window_frame: Arc<WindowFrame>, |
54 | 3 | ) -> Self { |
55 | 3 | Self { |
56 | 3 | expr, |
57 | 3 | partition_by: partition_by.to_vec(), |
58 | 3 | order_by: order_by.to_vec(), |
59 | 3 | window_frame, |
60 | 3 | } |
61 | 3 | } |
62 | | |
63 | | /// Get BuiltInWindowFunction expr of BuiltInWindowExpr |
64 | 0 | pub fn get_built_in_func_expr(&self) -> &Arc<dyn BuiltInWindowFunctionExpr> { |
65 | 0 | &self.expr |
66 | 0 | } |
67 | | |
68 | | /// Adds any equivalent orderings generated by the `self.expr` |
69 | | /// to `builder`. |
70 | | /// |
71 | | /// If `self.expr` doesn't have an ordering, ordering equivalence properties |
72 | | /// are not updated. Otherwise, ordering equivalence properties are updated |
73 | | /// by the ordering of `self.expr`. |
74 | 3 | pub fn add_equal_orderings(&self, eq_properties: &mut EquivalenceProperties) { |
75 | 3 | let schema = eq_properties.schema(); |
76 | 3 | if let Some(fn_res_ordering0 ) = self.expr.get_result_ordering(schema) { |
77 | 0 | if self.partition_by.is_empty() { |
78 | 0 | // In the absence of a PARTITION BY, ordering of `self.expr` is global: |
79 | 0 | eq_properties.add_new_orderings([vec![fn_res_ordering]]); |
80 | 0 | } else { |
81 | | // If we have a PARTITION BY, built-in functions can not introduce |
82 | | // a global ordering unless the existing ordering is compatible |
83 | | // with PARTITION BY expressions. To elaborate, when PARTITION BY |
84 | | // expressions and existing ordering expressions are equal (w.r.t. |
85 | | // set equality), we can prefix the ordering of `self.expr` with |
86 | | // the existing ordering. |
87 | 0 | let (mut ordering, _) = |
88 | 0 | eq_properties.find_longest_permutation(&self.partition_by); |
89 | 0 | if ordering.len() == self.partition_by.len() { |
90 | 0 | ordering.push(fn_res_ordering); |
91 | 0 | eq_properties.add_new_orderings([ordering]); |
92 | 0 | } |
93 | | } |
94 | 3 | } |
95 | 3 | } |
96 | | } |
97 | | |
98 | | impl WindowExpr for BuiltInWindowExpr { |
99 | | /// Return a reference to Any that can be used for downcasting |
100 | 3 | fn as_any(&self) -> &dyn Any { |
101 | 3 | self |
102 | 3 | } |
103 | | |
104 | 3 | fn name(&self) -> &str { |
105 | 3 | self.expr.name() |
106 | 3 | } |
107 | | |
108 | 6 | fn field(&self) -> Result<Field> { |
109 | 6 | self.expr.field() |
110 | 6 | } |
111 | | |
112 | 12 | fn expressions(&self) -> Vec<Arc<dyn PhysicalExpr>> { |
113 | 12 | self.expr.expressions() |
114 | 12 | } |
115 | | |
116 | 4 | fn partition_by(&self) -> &[Arc<dyn PhysicalExpr>] { |
117 | 4 | &self.partition_by |
118 | 4 | } |
119 | | |
120 | 12 | fn order_by(&self) -> &[PhysicalSortExpr] { |
121 | 12 | &self.order_by |
122 | 12 | } |
123 | | |
124 | 0 | fn evaluate(&self, batch: &RecordBatch) -> Result<ArrayRef> { |
125 | 0 | let mut evaluator = self.expr.create_evaluator()?; |
126 | 0 | let num_rows = batch.num_rows(); |
127 | 0 | if evaluator.uses_window_frame() { |
128 | 0 | let sort_options: Vec<SortOptions> = |
129 | 0 | self.order_by.iter().map(|o| o.options).collect(); |
130 | 0 | let mut row_wise_results = vec![]; |
131 | | |
132 | 0 | let mut values = self.evaluate_args(batch)?; |
133 | 0 | let order_bys = get_orderby_values(self.order_by_columns(batch)?); |
134 | 0 | let n_args = values.len(); |
135 | 0 | values.extend(order_bys); |
136 | 0 | let order_bys_ref = &values[n_args..]; |
137 | 0 |
|
138 | 0 | let mut window_frame_ctx = |
139 | 0 | WindowFrameContext::new(Arc::clone(&self.window_frame), sort_options); |
140 | 0 | let mut last_range = Range { start: 0, end: 0 }; |
141 | | // We iterate on each row to calculate window frame range and and window function result |
142 | 0 | for idx in 0..num_rows { |
143 | 0 | let range = window_frame_ctx.calculate_range( |
144 | 0 | order_bys_ref, |
145 | 0 | &last_range, |
146 | 0 | num_rows, |
147 | 0 | idx, |
148 | 0 | )?; |
149 | 0 | let value = evaluator.evaluate(&values, &range)?; |
150 | 0 | row_wise_results.push(value); |
151 | 0 | last_range = range; |
152 | | } |
153 | 0 | ScalarValue::iter_to_array(row_wise_results) |
154 | 0 | } else if evaluator.include_rank() { |
155 | 0 | let columns = self.order_by_columns(batch)?; |
156 | 0 | let sort_partition_points = evaluate_partition_ranges(num_rows, &columns)?; |
157 | 0 | evaluator.evaluate_all_with_rank(num_rows, &sort_partition_points) |
158 | | } else { |
159 | 0 | let values = self.evaluate_args(batch)?; |
160 | 0 | evaluator.evaluate_all(&values, num_rows) |
161 | | } |
162 | 0 | } |
163 | | |
164 | | /// Evaluate the window function against the batch. This function facilitates |
165 | | /// stateful, bounded-memory implementations. |
166 | 12 | fn evaluate_stateful( |
167 | 12 | &self, |
168 | 12 | partition_batches: &PartitionBatches, |
169 | 12 | window_agg_state: &mut PartitionWindowAggStates, |
170 | 12 | ) -> Result<()> { |
171 | 12 | let field = self.expr.field()?0 ; |
172 | 12 | let out_type = field.data_type(); |
173 | 12 | let sort_options = self.order_by.iter().map(|o| o.options0 ).collect::<Vec<_>>(); |
174 | 12 | for (partition_row, partition_batch_state) in partition_batches.iter() { |
175 | 12 | let window_state = |
176 | 12 | if let Some(window_state9 ) = window_agg_state.get_mut(partition_row) { |
177 | 9 | window_state |
178 | | } else { |
179 | 3 | let evaluator = self.expr.create_evaluator()?0 ; |
180 | 3 | window_agg_state |
181 | 3 | .entry(partition_row.clone()) |
182 | 3 | .or_insert(WindowState { |
183 | 3 | state: WindowAggState::new(out_type)?0 , |
184 | 3 | window_fn: WindowFn::Builtin(evaluator), |
185 | | }) |
186 | | }; |
187 | 12 | let evaluator = match &mut window_state.window_fn { |
188 | 12 | WindowFn::Builtin(evaluator) => evaluator, |
189 | 0 | _ => unreachable!(), |
190 | | }; |
191 | 12 | let state = &mut window_state.state; |
192 | 12 | |
193 | 12 | let batch_ref = &partition_batch_state.record_batch; |
194 | 12 | let mut values = self.evaluate_args(batch_ref)?0 ; |
195 | 12 | let order_bys = if evaluator.uses_window_frame() || evaluator.include_rank()0 { |
196 | 12 | get_orderby_values(self.order_by_columns(batch_ref)?0 ) |
197 | | } else { |
198 | 0 | vec![] |
199 | | }; |
200 | 12 | let n_args = values.len(); |
201 | 12 | values.extend(order_bys); |
202 | 12 | let order_bys_ref = &values[n_args..]; |
203 | 12 | |
204 | 12 | // We iterate on each row to perform a running calculation. |
205 | 12 | let record_batch = &partition_batch_state.record_batch; |
206 | 12 | let num_rows = record_batch.num_rows(); |
207 | 12 | let mut row_wise_results: Vec<ScalarValue> = vec![]; |
208 | 12 | let is_causal = if evaluator.uses_window_frame() { |
209 | 12 | self.window_frame.is_causal() |
210 | | } else { |
211 | 0 | evaluator.is_causal() |
212 | | }; |
213 | 27 | for idx in state.last_calculated_index..num_rows12 { |
214 | 27 | let frame_range = if evaluator.uses_window_frame() { |
215 | 27 | state |
216 | 27 | .window_frame_ctx |
217 | 27 | .get_or_insert_with(|| { |
218 | 3 | WindowFrameContext::new( |
219 | 3 | Arc::clone(&self.window_frame), |
220 | 3 | sort_options.clone(), |
221 | 3 | ) |
222 | 27 | }) |
223 | 27 | .calculate_range( |
224 | 27 | order_bys_ref, |
225 | 27 | // Start search from the last range |
226 | 27 | &state.window_frame_range, |
227 | 27 | num_rows, |
228 | 27 | idx, |
229 | 27 | ) |
230 | | } else { |
231 | 0 | evaluator.get_range(idx, num_rows) |
232 | 0 | }?; |
233 | | |
234 | | // Exit if the range is non-causal and extends all the way: |
235 | 27 | if frame_range.end == num_rows |
236 | 9 | && !is_causal |
237 | 0 | && !partition_batch_state.is_end |
238 | | { |
239 | 0 | break; |
240 | 27 | } |
241 | 27 | // Update last range |
242 | 27 | state.window_frame_range = frame_range; |
243 | 27 | row_wise_results |
244 | 27 | .push(evaluator.evaluate(&values, &state.window_frame_range)?0 ); |
245 | | } |
246 | 12 | let out_col = if row_wise_results.is_empty() { |
247 | 3 | new_empty_array(out_type) |
248 | | } else { |
249 | 9 | ScalarValue::iter_to_array(row_wise_results.into_iter())?0 |
250 | | }; |
251 | | |
252 | 12 | state.update(&out_col, partition_batch_state)?0 ; |
253 | 12 | if self.window_frame.start_bound.is_unbounded() { |
254 | 12 | evaluator.memoize(state)?0 ; |
255 | 0 | } |
256 | | } |
257 | 12 | Ok(()) |
258 | 12 | } |
259 | | |
260 | 3 | fn get_window_frame(&self) -> &Arc<WindowFrame> { |
261 | 3 | &self.window_frame |
262 | 3 | } |
263 | | |
264 | 0 | fn get_reverse_expr(&self) -> Option<Arc<dyn WindowExpr>> { |
265 | 0 | self.expr.reverse_expr().map(|reverse_expr| { |
266 | 0 | Arc::new(BuiltInWindowExpr::new( |
267 | 0 | reverse_expr, |
268 | 0 | &self.partition_by.clone(), |
269 | 0 | &reverse_order_bys(&self.order_by), |
270 | 0 | Arc::new(self.window_frame.reverse()), |
271 | 0 | )) as _ |
272 | 0 | }) |
273 | 0 | } |
274 | | |
275 | 0 | fn uses_bounded_memory(&self) -> bool { |
276 | 0 | if let Ok(evaluator) = self.expr.create_evaluator() { |
277 | 0 | evaluator.supports_bounded_execution() |
278 | 0 | && (!evaluator.uses_window_frame() |
279 | 0 | || !self.window_frame.end_bound.is_unbounded()) |
280 | | } else { |
281 | 0 | false |
282 | | } |
283 | 0 | } |
284 | | } |