/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/windows/window_agg_exec.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Stream and channel implementations for window function expressions. |
19 | | |
20 | | use std::any::Any; |
21 | | use std::pin::Pin; |
22 | | use std::sync::Arc; |
23 | | use std::task::{Context, Poll}; |
24 | | |
25 | | use super::utils::create_schema; |
26 | | use crate::expressions::PhysicalSortExpr; |
27 | | use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; |
28 | | use crate::windows::{ |
29 | | calc_requirements, get_ordered_partition_by_indices, get_partition_by_sort_exprs, |
30 | | window_equivalence_properties, |
31 | | }; |
32 | | use crate::{ |
33 | | ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionMode, |
34 | | ExecutionPlan, ExecutionPlanProperties, PhysicalExpr, PlanProperties, |
35 | | RecordBatchStream, SendableRecordBatchStream, Statistics, WindowExpr, |
36 | | }; |
37 | | use arrow::array::ArrayRef; |
38 | | use arrow::compute::{concat, concat_batches}; |
39 | | use arrow::datatypes::SchemaRef; |
40 | | use arrow::error::ArrowError; |
41 | | use arrow::record_batch::RecordBatch; |
42 | | use datafusion_common::stats::Precision; |
43 | | use datafusion_common::utils::{evaluate_partition_ranges, transpose}; |
44 | | use datafusion_common::{internal_err, Result}; |
45 | | use datafusion_execution::TaskContext; |
46 | | use datafusion_physical_expr_common::sort_expr::LexRequirement; |
47 | | use futures::{ready, Stream, StreamExt}; |
48 | | |
49 | | /// Window execution plan |
50 | | #[derive(Debug)] |
51 | | pub struct WindowAggExec { |
52 | | /// Input plan |
53 | | pub(crate) input: Arc<dyn ExecutionPlan>, |
54 | | /// Window function expression |
55 | | window_expr: Vec<Arc<dyn WindowExpr>>, |
56 | | /// Schema after the window is run |
57 | | schema: SchemaRef, |
58 | | /// Partition Keys |
59 | | pub partition_keys: Vec<Arc<dyn PhysicalExpr>>, |
60 | | /// Execution metrics |
61 | | metrics: ExecutionPlanMetricsSet, |
62 | | /// Partition by indices that defines preset for existing ordering |
63 | | // see `get_ordered_partition_by_indices` for more details. |
64 | | ordered_partition_by_indices: Vec<usize>, |
65 | | /// Cache holding plan properties like equivalences, output partitioning etc. |
66 | | cache: PlanProperties, |
67 | | } |
68 | | |
69 | | impl WindowAggExec { |
70 | | /// Create a new execution plan for window aggregates |
71 | 1 | pub fn try_new( |
72 | 1 | window_expr: Vec<Arc<dyn WindowExpr>>, |
73 | 1 | input: Arc<dyn ExecutionPlan>, |
74 | 1 | partition_keys: Vec<Arc<dyn PhysicalExpr>>, |
75 | 1 | ) -> Result<Self> { |
76 | 1 | let schema = create_schema(&input.schema(), &window_expr)?; |
77 | 1 | let schema = Arc::new(schema); |
78 | 1 | |
79 | 1 | let ordered_partition_by_indices = |
80 | 1 | get_ordered_partition_by_indices(window_expr[0].partition_by(), &input); |
81 | 1 | let cache = Self::compute_properties(Arc::clone(&schema), &input, &window_expr); |
82 | 1 | Ok(Self { |
83 | 1 | input, |
84 | 1 | window_expr, |
85 | 1 | schema, |
86 | 1 | partition_keys, |
87 | 1 | metrics: ExecutionPlanMetricsSet::new(), |
88 | 1 | ordered_partition_by_indices, |
89 | 1 | cache, |
90 | 1 | }) |
91 | 1 | } |
92 | | |
93 | | /// Window expressions |
94 | 1 | pub fn window_expr(&self) -> &[Arc<dyn WindowExpr>] { |
95 | 1 | &self.window_expr |
96 | 1 | } |
97 | | |
98 | | /// Input plan |
99 | 0 | pub fn input(&self) -> &Arc<dyn ExecutionPlan> { |
100 | 0 | &self.input |
101 | 0 | } |
102 | | |
103 | | /// Return the output sort order of partition keys: For example |
104 | | /// OVER(PARTITION BY a, ORDER BY b) -> would give sorting of the column a |
105 | | // We are sure that partition by columns are always at the beginning of sort_keys |
106 | | // Hence returned `PhysicalSortExpr` corresponding to `PARTITION BY` columns can be used safely |
107 | | // to calculate partition separation points |
108 | 1 | pub fn partition_by_sort_keys(&self) -> Result<Vec<PhysicalSortExpr>> { |
109 | 1 | let partition_by = self.window_expr()[0].partition_by(); |
110 | 1 | get_partition_by_sort_exprs( |
111 | 1 | &self.input, |
112 | 1 | partition_by, |
113 | 1 | &self.ordered_partition_by_indices, |
114 | 1 | ) |
115 | 1 | } |
116 | | |
117 | | /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. |
118 | 1 | fn compute_properties( |
119 | 1 | schema: SchemaRef, |
120 | 1 | input: &Arc<dyn ExecutionPlan>, |
121 | 1 | window_expr: &[Arc<dyn WindowExpr>], |
122 | 1 | ) -> PlanProperties { |
123 | 1 | // Calculate equivalence properties: |
124 | 1 | let eq_properties = window_equivalence_properties(&schema, input, window_expr); |
125 | 1 | |
126 | 1 | // Get output partitioning: |
127 | 1 | // Because we can have repartitioning using the partition keys this |
128 | 1 | // would be either 1 or more than 1 depending on the presence of repartitioning. |
129 | 1 | let output_partitioning = input.output_partitioning().clone(); |
130 | | |
131 | | // Determine execution mode: |
132 | 1 | let mode = match input.execution_mode() { |
133 | 1 | ExecutionMode::Bounded => ExecutionMode::Bounded, |
134 | | ExecutionMode::Unbounded | ExecutionMode::PipelineBreaking => { |
135 | 0 | ExecutionMode::PipelineBreaking |
136 | | } |
137 | | }; |
138 | | |
139 | | // Construct properties cache: |
140 | 1 | PlanProperties::new(eq_properties, output_partitioning, mode) |
141 | 1 | } |
142 | | } |
143 | | |
144 | | impl DisplayAs for WindowAggExec { |
145 | 0 | fn fmt_as( |
146 | 0 | &self, |
147 | 0 | t: DisplayFormatType, |
148 | 0 | f: &mut std::fmt::Formatter, |
149 | 0 | ) -> std::fmt::Result { |
150 | 0 | match t { |
151 | | DisplayFormatType::Default | DisplayFormatType::Verbose => { |
152 | 0 | write!(f, "WindowAggExec: ")?; |
153 | 0 | let g: Vec<String> = self |
154 | 0 | .window_expr |
155 | 0 | .iter() |
156 | 0 | .map(|e| { |
157 | 0 | format!( |
158 | 0 | "{}: {:?}, frame: {:?}", |
159 | 0 | e.name().to_owned(), |
160 | 0 | e.field(), |
161 | 0 | e.get_window_frame() |
162 | 0 | ) |
163 | 0 | }) |
164 | 0 | .collect(); |
165 | 0 | write!(f, "wdw=[{}]", g.join(", "))?; |
166 | | } |
167 | | } |
168 | 0 | Ok(()) |
169 | 0 | } |
170 | | } |
171 | | |
172 | | impl ExecutionPlan for WindowAggExec { |
173 | 0 | fn name(&self) -> &'static str { |
174 | 0 | "WindowAggExec" |
175 | 0 | } |
176 | | |
177 | | /// Return a reference to Any that can be used for downcasting |
178 | 0 | fn as_any(&self) -> &dyn Any { |
179 | 0 | self |
180 | 0 | } |
181 | | |
182 | 1 | fn properties(&self) -> &PlanProperties { |
183 | 1 | &self.cache |
184 | 1 | } |
185 | | |
186 | 0 | fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> { |
187 | 0 | vec![&self.input] |
188 | 0 | } |
189 | | |
190 | 0 | fn maintains_input_order(&self) -> Vec<bool> { |
191 | 0 | vec![true] |
192 | 0 | } |
193 | | |
194 | 0 | fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> { |
195 | 0 | let partition_bys = self.window_expr()[0].partition_by(); |
196 | 0 | let order_keys = self.window_expr()[0].order_by(); |
197 | 0 | if self.ordered_partition_by_indices.len() < partition_bys.len() { |
198 | 0 | vec![calc_requirements(partition_bys, order_keys)] |
199 | | } else { |
200 | 0 | let partition_bys = self |
201 | 0 | .ordered_partition_by_indices |
202 | 0 | .iter() |
203 | 0 | .map(|idx| &partition_bys[*idx]); |
204 | 0 | vec![calc_requirements(partition_bys, order_keys)] |
205 | | } |
206 | 0 | } |
207 | | |
208 | 0 | fn required_input_distribution(&self) -> Vec<Distribution> { |
209 | 0 | if self.partition_keys.is_empty() { |
210 | 0 | vec![Distribution::SinglePartition] |
211 | | } else { |
212 | 0 | vec![Distribution::HashPartitioned(self.partition_keys.clone())] |
213 | | } |
214 | 0 | } |
215 | | |
216 | 0 | fn with_new_children( |
217 | 0 | self: Arc<Self>, |
218 | 0 | children: Vec<Arc<dyn ExecutionPlan>>, |
219 | 0 | ) -> Result<Arc<dyn ExecutionPlan>> { |
220 | 0 | Ok(Arc::new(WindowAggExec::try_new( |
221 | 0 | self.window_expr.clone(), |
222 | 0 | Arc::clone(&children[0]), |
223 | 0 | self.partition_keys.clone(), |
224 | 0 | )?)) |
225 | 0 | } |
226 | | |
227 | 1 | fn execute( |
228 | 1 | &self, |
229 | 1 | partition: usize, |
230 | 1 | context: Arc<TaskContext>, |
231 | 1 | ) -> Result<SendableRecordBatchStream> { |
232 | 1 | let input = self.input.execute(partition, context)?; |
233 | 1 | let stream = Box::pin(WindowAggStream::new( |
234 | 1 | Arc::clone(&self.schema), |
235 | 1 | self.window_expr.clone(), |
236 | 1 | input, |
237 | 1 | BaselineMetrics::new(&self.metrics, partition), |
238 | 1 | self.partition_by_sort_keys()?, |
239 | 1 | self.ordered_partition_by_indices.clone(), |
240 | 0 | )?); |
241 | 1 | Ok(stream) |
242 | 1 | } |
243 | | |
244 | 0 | fn metrics(&self) -> Option<MetricsSet> { |
245 | 0 | Some(self.metrics.clone_inner()) |
246 | 0 | } |
247 | | |
248 | 0 | fn statistics(&self) -> Result<Statistics> { |
249 | 0 | let input_stat = self.input.statistics()?; |
250 | 0 | let win_cols = self.window_expr.len(); |
251 | 0 | let input_cols = self.input.schema().fields().len(); |
252 | 0 | // TODO stats: some windowing function will maintain invariants such as min, max... |
253 | 0 | let mut column_statistics = Vec::with_capacity(win_cols + input_cols); |
254 | 0 | // copy stats of the input to the beginning of the schema. |
255 | 0 | column_statistics.extend(input_stat.column_statistics); |
256 | 0 | for _ in 0..win_cols { |
257 | 0 | column_statistics.push(ColumnStatistics::new_unknown()) |
258 | | } |
259 | 0 | Ok(Statistics { |
260 | 0 | num_rows: input_stat.num_rows, |
261 | 0 | column_statistics, |
262 | 0 | total_byte_size: Precision::Absent, |
263 | 0 | }) |
264 | 0 | } |
265 | | } |
266 | | |
267 | | /// Compute the window aggregate columns |
268 | 0 | fn compute_window_aggregates( |
269 | 0 | window_expr: &[Arc<dyn WindowExpr>], |
270 | 0 | batch: &RecordBatch, |
271 | 0 | ) -> Result<Vec<ArrayRef>> { |
272 | 0 | window_expr |
273 | 0 | .iter() |
274 | 0 | .map(|window_expr| window_expr.evaluate(batch)) |
275 | 0 | .collect() |
276 | 0 | } |
277 | | |
278 | | /// stream for window aggregation plan |
279 | | pub struct WindowAggStream { |
280 | | schema: SchemaRef, |
281 | | input: SendableRecordBatchStream, |
282 | | batches: Vec<RecordBatch>, |
283 | | finished: bool, |
284 | | window_expr: Vec<Arc<dyn WindowExpr>>, |
285 | | partition_by_sort_keys: Vec<PhysicalSortExpr>, |
286 | | baseline_metrics: BaselineMetrics, |
287 | | ordered_partition_by_indices: Vec<usize>, |
288 | | } |
289 | | |
290 | | impl WindowAggStream { |
291 | | /// Create a new WindowAggStream |
292 | 1 | pub fn new( |
293 | 1 | schema: SchemaRef, |
294 | 1 | window_expr: Vec<Arc<dyn WindowExpr>>, |
295 | 1 | input: SendableRecordBatchStream, |
296 | 1 | baseline_metrics: BaselineMetrics, |
297 | 1 | partition_by_sort_keys: Vec<PhysicalSortExpr>, |
298 | 1 | ordered_partition_by_indices: Vec<usize>, |
299 | 1 | ) -> Result<Self> { |
300 | 1 | // In WindowAggExec all partition by columns should be ordered. |
301 | 1 | if window_expr[0].partition_by().len() != ordered_partition_by_indices.len() { |
302 | 0 | return internal_err!("All partition by columns should have an ordering"); |
303 | 1 | } |
304 | 1 | Ok(Self { |
305 | 1 | schema, |
306 | 1 | input, |
307 | 1 | batches: vec![], |
308 | 1 | finished: false, |
309 | 1 | window_expr, |
310 | 1 | baseline_metrics, |
311 | 1 | partition_by_sort_keys, |
312 | 1 | ordered_partition_by_indices, |
313 | 1 | }) |
314 | 1 | } |
315 | | |
316 | 0 | fn compute_aggregates(&self) -> Result<RecordBatch> { |
317 | 0 | // record compute time on drop |
318 | 0 | let _timer = self.baseline_metrics.elapsed_compute().timer(); |
319 | 0 | let batch = concat_batches(&self.input.schema(), &self.batches)?; |
320 | 0 | if batch.num_rows() == 0 { |
321 | 0 | return Ok(RecordBatch::new_empty(Arc::clone(&self.schema))); |
322 | 0 | } |
323 | | |
324 | 0 | let partition_by_sort_keys = self |
325 | 0 | .ordered_partition_by_indices |
326 | 0 | .iter() |
327 | 0 | .map(|idx| self.partition_by_sort_keys[*idx].evaluate_to_sort_column(&batch)) |
328 | 0 | .collect::<Result<Vec<_>>>()?; |
329 | 0 | let partition_points = |
330 | 0 | evaluate_partition_ranges(batch.num_rows(), &partition_by_sort_keys)?; |
331 | | |
332 | 0 | let mut partition_results = vec![]; |
333 | | // Calculate window cols |
334 | 0 | for partition_point in partition_points { |
335 | 0 | let length = partition_point.end - partition_point.start; |
336 | 0 | partition_results.push(compute_window_aggregates( |
337 | 0 | &self.window_expr, |
338 | 0 | &batch.slice(partition_point.start, length), |
339 | 0 | )?) |
340 | | } |
341 | 0 | let columns = transpose(partition_results) |
342 | 0 | .iter() |
343 | 0 | .map(|elems| concat(&elems.iter().map(|x| x.as_ref()).collect::<Vec<_>>())) |
344 | 0 | .collect::<Vec<_>>() |
345 | 0 | .into_iter() |
346 | 0 | .collect::<Result<Vec<ArrayRef>, ArrowError>>()?; |
347 | | |
348 | | // combine with the original cols |
349 | | // note the setup of window aggregates is that they newly calculated window |
350 | | // expression results are always appended to the columns |
351 | 0 | let mut batch_columns = batch.columns().to_vec(); |
352 | 0 | // calculate window cols |
353 | 0 | batch_columns.extend_from_slice(&columns); |
354 | 0 | Ok(RecordBatch::try_new( |
355 | 0 | Arc::clone(&self.schema), |
356 | 0 | batch_columns, |
357 | 0 | )?) |
358 | 0 | } |
359 | | } |
360 | | |
361 | | impl Stream for WindowAggStream { |
362 | | type Item = Result<RecordBatch>; |
363 | | |
364 | 1 | fn poll_next( |
365 | 1 | mut self: Pin<&mut Self>, |
366 | 1 | cx: &mut Context<'_>, |
367 | 1 | ) -> Poll<Option<Self::Item>> { |
368 | 1 | let poll = self.poll_next_inner(cx); |
369 | 1 | self.baseline_metrics.record_poll(poll) |
370 | 1 | } |
371 | | } |
372 | | |
373 | | impl WindowAggStream { |
374 | | #[inline] |
375 | 1 | fn poll_next_inner( |
376 | 1 | &mut self, |
377 | 1 | cx: &mut Context<'_>, |
378 | 1 | ) -> Poll<Option<Result<RecordBatch>>> { |
379 | 1 | if self.finished { |
380 | 0 | return Poll::Ready(None); |
381 | 1 | } |
382 | | |
383 | | loop { |
384 | 1 | let result = match ready!(self.input.poll_next_unpin(cx)) { |
385 | 0 | Some(Ok(batch)) => { |
386 | 0 | self.batches.push(batch); |
387 | 0 | continue; |
388 | | } |
389 | 0 | Some(Err(e)) => Err(e), |
390 | 0 | None => self.compute_aggregates(), |
391 | | }; |
392 | | |
393 | 0 | self.finished = true; |
394 | 0 | |
395 | 0 | return Poll::Ready(Some(result)); |
396 | | } |
397 | 1 | } |
398 | | } |
399 | | |
400 | | impl RecordBatchStream for WindowAggStream { |
401 | | /// Get the schema |
402 | 0 | fn schema(&self) -> SchemaRef { |
403 | 0 | Arc::clone(&self.schema) |
404 | 0 | } |
405 | | } |