Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/windows/window_agg_exec.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Execution plan ([`WindowAggExec`]) and stream ([`WindowAggStream`]) for evaluating window function expressions.
19
20
use std::any::Any;
21
use std::pin::Pin;
22
use std::sync::Arc;
23
use std::task::{Context, Poll};
24
25
use super::utils::create_schema;
26
use crate::expressions::PhysicalSortExpr;
27
use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
28
use crate::windows::{
29
    calc_requirements, get_ordered_partition_by_indices, get_partition_by_sort_exprs,
30
    window_equivalence_properties,
31
};
32
use crate::{
33
    ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionMode,
34
    ExecutionPlan, ExecutionPlanProperties, PhysicalExpr, PlanProperties,
35
    RecordBatchStream, SendableRecordBatchStream, Statistics, WindowExpr,
36
};
37
use arrow::array::ArrayRef;
38
use arrow::compute::{concat, concat_batches};
39
use arrow::datatypes::SchemaRef;
40
use arrow::error::ArrowError;
41
use arrow::record_batch::RecordBatch;
42
use datafusion_common::stats::Precision;
43
use datafusion_common::utils::{evaluate_partition_ranges, transpose};
44
use datafusion_common::{internal_err, Result};
45
use datafusion_execution::TaskContext;
46
use datafusion_physical_expr_common::sort_expr::LexRequirement;
47
use futures::{ready, Stream, StreamExt};
48
49
/// Window execution plan
///
/// Evaluates a list of window expressions over its input. Each stream created
/// by `execute` buffers every batch of its input partition and, once the input
/// is exhausted, emits the input columns followed by one column per window
/// expression.
#[derive(Debug)]
pub struct WindowAggExec {
    /// Input plan
    pub(crate) input: Arc<dyn ExecutionPlan>,
    /// Window function expressions; the `PARTITION BY` / `ORDER BY` clauses
    /// are read from the first expression
    window_expr: Vec<Arc<dyn WindowExpr>>,
    /// Schema after the window is run
    schema: SchemaRef,
    /// Partition keys; when non-empty the input is required to be hash
    /// partitioned on them, otherwise a single partition is required
    pub partition_keys: Vec<Arc<dyn PhysicalExpr>>,
    /// Execution metrics
    metrics: ExecutionPlanMetricsSet,
    /// Indices into the `PARTITION BY` expressions, as computed by
    /// `get_ordered_partition_by_indices` from the ordering of the input
    // see `get_ordered_partition_by_indices` for more details.
    ordered_partition_by_indices: Vec<usize>,
    /// Cache holding plan properties like equivalences, output partitioning etc.
    cache: PlanProperties,
}
68
69
impl WindowAggExec {
70
    /// Create a new execution plan for window aggregates
71
1
    pub fn try_new(
72
1
        window_expr: Vec<Arc<dyn WindowExpr>>,
73
1
        input: Arc<dyn ExecutionPlan>,
74
1
        partition_keys: Vec<Arc<dyn PhysicalExpr>>,
75
1
    ) -> Result<Self> {
76
1
        let schema = create_schema(&input.schema(), &window_expr)
?0
;
77
1
        let schema = Arc::new(schema);
78
1
79
1
        let ordered_partition_by_indices =
80
1
            get_ordered_partition_by_indices(window_expr[0].partition_by(), &input);
81
1
        let cache = Self::compute_properties(Arc::clone(&schema), &input, &window_expr);
82
1
        Ok(Self {
83
1
            input,
84
1
            window_expr,
85
1
            schema,
86
1
            partition_keys,
87
1
            metrics: ExecutionPlanMetricsSet::new(),
88
1
            ordered_partition_by_indices,
89
1
            cache,
90
1
        })
91
1
    }
92
93
    /// Window expressions
94
1
    pub fn window_expr(&self) -> &[Arc<dyn WindowExpr>] {
95
1
        &self.window_expr
96
1
    }
97
98
    /// Input plan
99
0
    pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
100
0
        &self.input
101
0
    }
102
103
    /// Return the output sort order of partition keys: For example
104
    /// OVER(PARTITION BY a, ORDER BY b) -> would give sorting of the column a
105
    // We are sure that partition by columns are always at the beginning of sort_keys
106
    // Hence returned `PhysicalSortExpr` corresponding to `PARTITION BY` columns can be used safely
107
    // to calculate partition separation points
108
1
    pub fn partition_by_sort_keys(&self) -> Result<Vec<PhysicalSortExpr>> {
109
1
        let partition_by = self.window_expr()[0].partition_by();
110
1
        get_partition_by_sort_exprs(
111
1
            &self.input,
112
1
            partition_by,
113
1
            &self.ordered_partition_by_indices,
114
1
        )
115
1
    }
116
117
    /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
118
1
    fn compute_properties(
119
1
        schema: SchemaRef,
120
1
        input: &Arc<dyn ExecutionPlan>,
121
1
        window_expr: &[Arc<dyn WindowExpr>],
122
1
    ) -> PlanProperties {
123
1
        // Calculate equivalence properties:
124
1
        let eq_properties = window_equivalence_properties(&schema, input, window_expr);
125
1
126
1
        // Get output partitioning:
127
1
        // Because we can have repartitioning using the partition keys this
128
1
        // would be either 1 or more than 1 depending on the presence of repartitioning.
129
1
        let output_partitioning = input.output_partitioning().clone();
130
131
        // Determine execution mode:
132
1
        let mode = match input.execution_mode() {
133
1
            ExecutionMode::Bounded => ExecutionMode::Bounded,
134
            ExecutionMode::Unbounded | ExecutionMode::PipelineBreaking => {
135
0
                ExecutionMode::PipelineBreaking
136
            }
137
        };
138
139
        // Construct properties cache:
140
1
        PlanProperties::new(eq_properties, output_partitioning, mode)
141
1
    }
142
}
143
144
impl DisplayAs for WindowAggExec {
145
0
    fn fmt_as(
146
0
        &self,
147
0
        t: DisplayFormatType,
148
0
        f: &mut std::fmt::Formatter,
149
0
    ) -> std::fmt::Result {
150
0
        match t {
151
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
152
0
                write!(f, "WindowAggExec: ")?;
153
0
                let g: Vec<String> = self
154
0
                    .window_expr
155
0
                    .iter()
156
0
                    .map(|e| {
157
0
                        format!(
158
0
                            "{}: {:?}, frame: {:?}",
159
0
                            e.name().to_owned(),
160
0
                            e.field(),
161
0
                            e.get_window_frame()
162
0
                        )
163
0
                    })
164
0
                    .collect();
165
0
                write!(f, "wdw=[{}]", g.join(", "))?;
166
            }
167
        }
168
0
        Ok(())
169
0
    }
170
}
171
172
impl ExecutionPlan for WindowAggExec {
173
0
    fn name(&self) -> &'static str {
174
0
        "WindowAggExec"
175
0
    }
176
177
    /// Return a reference to Any that can be used for downcasting
178
0
    fn as_any(&self) -> &dyn Any {
179
0
        self
180
0
    }
181
182
1
    fn properties(&self) -> &PlanProperties {
183
1
        &self.cache
184
1
    }
185
186
0
    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
187
0
        vec![&self.input]
188
0
    }
189
190
0
    fn maintains_input_order(&self) -> Vec<bool> {
191
0
        vec![true]
192
0
    }
193
194
0
    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
195
0
        let partition_bys = self.window_expr()[0].partition_by();
196
0
        let order_keys = self.window_expr()[0].order_by();
197
0
        if self.ordered_partition_by_indices.len() < partition_bys.len() {
198
0
            vec![calc_requirements(partition_bys, order_keys)]
199
        } else {
200
0
            let partition_bys = self
201
0
                .ordered_partition_by_indices
202
0
                .iter()
203
0
                .map(|idx| &partition_bys[*idx]);
204
0
            vec![calc_requirements(partition_bys, order_keys)]
205
        }
206
0
    }
207
208
0
    fn required_input_distribution(&self) -> Vec<Distribution> {
209
0
        if self.partition_keys.is_empty() {
210
0
            vec![Distribution::SinglePartition]
211
        } else {
212
0
            vec![Distribution::HashPartitioned(self.partition_keys.clone())]
213
        }
214
0
    }
215
216
0
    fn with_new_children(
217
0
        self: Arc<Self>,
218
0
        children: Vec<Arc<dyn ExecutionPlan>>,
219
0
    ) -> Result<Arc<dyn ExecutionPlan>> {
220
0
        Ok(Arc::new(WindowAggExec::try_new(
221
0
            self.window_expr.clone(),
222
0
            Arc::clone(&children[0]),
223
0
            self.partition_keys.clone(),
224
0
        )?))
225
0
    }
226
227
1
    fn execute(
228
1
        &self,
229
1
        partition: usize,
230
1
        context: Arc<TaskContext>,
231
1
    ) -> Result<SendableRecordBatchStream> {
232
1
        let input = self.input.execute(partition, context)
?0
;
233
1
        let stream = Box::pin(WindowAggStream::new(
234
1
            Arc::clone(&self.schema),
235
1
            self.window_expr.clone(),
236
1
            input,
237
1
            BaselineMetrics::new(&self.metrics, partition),
238
1
            self.partition_by_sort_keys()
?0
,
239
1
            self.ordered_partition_by_indices.clone(),
240
0
        )?);
241
1
        Ok(stream)
242
1
    }
243
244
0
    fn metrics(&self) -> Option<MetricsSet> {
245
0
        Some(self.metrics.clone_inner())
246
0
    }
247
248
0
    fn statistics(&self) -> Result<Statistics> {
249
0
        let input_stat = self.input.statistics()?;
250
0
        let win_cols = self.window_expr.len();
251
0
        let input_cols = self.input.schema().fields().len();
252
0
        // TODO stats: some windowing function will maintain invariants such as min, max...
253
0
        let mut column_statistics = Vec::with_capacity(win_cols + input_cols);
254
0
        // copy stats of the input to the beginning of the schema.
255
0
        column_statistics.extend(input_stat.column_statistics);
256
0
        for _ in 0..win_cols {
257
0
            column_statistics.push(ColumnStatistics::new_unknown())
258
        }
259
0
        Ok(Statistics {
260
0
            num_rows: input_stat.num_rows,
261
0
            column_statistics,
262
0
            total_byte_size: Precision::Absent,
263
0
        })
264
0
    }
265
}
266
267
/// Compute the window aggregate columns
268
0
fn compute_window_aggregates(
269
0
    window_expr: &[Arc<dyn WindowExpr>],
270
0
    batch: &RecordBatch,
271
0
) -> Result<Vec<ArrayRef>> {
272
0
    window_expr
273
0
        .iter()
274
0
        .map(|window_expr| window_expr.evaluate(batch))
275
0
        .collect()
276
0
}
277
278
/// Stream for the window aggregation plan
///
/// Buffers every batch of the input stream and, once the input is exhausted,
/// produces a single batch holding the input columns followed by the window
/// expression results.
pub struct WindowAggStream {
    /// Schema of the produced batches
    schema: SchemaRef,
    /// Input stream whose batches are buffered
    input: SendableRecordBatchStream,
    /// Input batches received so far
    batches: Vec<RecordBatch>,
    /// Set once the final result (or an input error) has been returned
    finished: bool,
    /// Window expressions evaluated on every partition
    window_expr: Vec<Arc<dyn WindowExpr>>,
    /// Sort expressions evaluated to find the partition boundaries
    partition_by_sort_keys: Vec<PhysicalSortExpr>,
    /// Records elapsed compute time and poll results
    baseline_metrics: BaselineMetrics,
    /// Indices used to select entries of `partition_by_sort_keys`
    ordered_partition_by_indices: Vec<usize>,
}
289
290
impl WindowAggStream {
291
    /// Create a new WindowAggStream
292
1
    pub fn new(
293
1
        schema: SchemaRef,
294
1
        window_expr: Vec<Arc<dyn WindowExpr>>,
295
1
        input: SendableRecordBatchStream,
296
1
        baseline_metrics: BaselineMetrics,
297
1
        partition_by_sort_keys: Vec<PhysicalSortExpr>,
298
1
        ordered_partition_by_indices: Vec<usize>,
299
1
    ) -> Result<Self> {
300
1
        // In WindowAggExec all partition by columns should be ordered.
301
1
        if window_expr[0].partition_by().len() != ordered_partition_by_indices.len() {
302
0
            return internal_err!("All partition by columns should have an ordering");
303
1
        }
304
1
        Ok(Self {
305
1
            schema,
306
1
            input,
307
1
            batches: vec![],
308
1
            finished: false,
309
1
            window_expr,
310
1
            baseline_metrics,
311
1
            partition_by_sort_keys,
312
1
            ordered_partition_by_indices,
313
1
        })
314
1
    }
315
316
0
    fn compute_aggregates(&self) -> Result<RecordBatch> {
317
0
        // record compute time on drop
318
0
        let _timer = self.baseline_metrics.elapsed_compute().timer();
319
0
        let batch = concat_batches(&self.input.schema(), &self.batches)?;
320
0
        if batch.num_rows() == 0 {
321
0
            return Ok(RecordBatch::new_empty(Arc::clone(&self.schema)));
322
0
        }
323
324
0
        let partition_by_sort_keys = self
325
0
            .ordered_partition_by_indices
326
0
            .iter()
327
0
            .map(|idx| self.partition_by_sort_keys[*idx].evaluate_to_sort_column(&batch))
328
0
            .collect::<Result<Vec<_>>>()?;
329
0
        let partition_points =
330
0
            evaluate_partition_ranges(batch.num_rows(), &partition_by_sort_keys)?;
331
332
0
        let mut partition_results = vec![];
333
        // Calculate window cols
334
0
        for partition_point in partition_points {
335
0
            let length = partition_point.end - partition_point.start;
336
0
            partition_results.push(compute_window_aggregates(
337
0
                &self.window_expr,
338
0
                &batch.slice(partition_point.start, length),
339
0
            )?)
340
        }
341
0
        let columns = transpose(partition_results)
342
0
            .iter()
343
0
            .map(|elems| concat(&elems.iter().map(|x| x.as_ref()).collect::<Vec<_>>()))
344
0
            .collect::<Vec<_>>()
345
0
            .into_iter()
346
0
            .collect::<Result<Vec<ArrayRef>, ArrowError>>()?;
347
348
        // combine with the original cols
349
        // note the setup of window aggregates is that they newly calculated window
350
        // expression results are always appended to the columns
351
0
        let mut batch_columns = batch.columns().to_vec();
352
0
        // calculate window cols
353
0
        batch_columns.extend_from_slice(&columns);
354
0
        Ok(RecordBatch::try_new(
355
0
            Arc::clone(&self.schema),
356
0
            batch_columns,
357
0
        )?)
358
0
    }
359
}
360
361
impl Stream for WindowAggStream {
362
    type Item = Result<RecordBatch>;
363
364
1
    fn poll_next(
365
1
        mut self: Pin<&mut Self>,
366
1
        cx: &mut Context<'_>,
367
1
    ) -> Poll<Option<Self::Item>> {
368
1
        let poll = self.poll_next_inner(cx);
369
1
        self.baseline_metrics.record_poll(poll)
370
1
    }
371
}
372
373
impl WindowAggStream {
374
    #[inline]
375
1
    fn poll_next_inner(
376
1
        &mut self,
377
1
        cx: &mut Context<'_>,
378
1
    ) -> Poll<Option<Result<RecordBatch>>> {
379
1
        if self.finished {
380
0
            return Poll::Ready(None);
381
1
        }
382
383
        loop {
384
1
            let 
result0
= match ready!(self.input.poll_next_unpin(cx)) {
385
0
                Some(Ok(batch)) => {
386
0
                    self.batches.push(batch);
387
0
                    continue;
388
                }
389
0
                Some(Err(e)) => Err(e),
390
0
                None => self.compute_aggregates(),
391
            };
392
393
0
            self.finished = true;
394
0
395
0
            return Poll::Ready(Some(result));
396
        }
397
1
    }
398
}
399
400
impl RecordBatchStream for WindowAggStream {
    /// Get the schema of the batches produced by this stream (a shared
    /// reference to the schema the stream was created with).
    fn schema(&self) -> SchemaRef {
        Arc::clone(&self.schema)
    }
}