Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Stream and channel implementations for window function expressions.
19
//! The executor given here uses bounded memory (does not maintain all
20
//! the input data seen so far), which makes it appropriate when processing
21
//! infinite inputs.
22
23
use std::any::Any;
24
use std::cmp::{min, Ordering};
25
use std::collections::{HashMap, VecDeque};
26
use std::pin::Pin;
27
use std::sync::Arc;
28
use std::task::{Context, Poll};
29
30
use super::utils::create_schema;
31
use crate::expressions::PhysicalSortExpr;
32
use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
33
use crate::windows::{
34
    calc_requirements, get_ordered_partition_by_indices, get_partition_by_sort_exprs,
35
    window_equivalence_properties,
36
};
37
use crate::{
38
    ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan,
39
    ExecutionPlanProperties, InputOrderMode, PlanProperties, RecordBatchStream,
40
    SendableRecordBatchStream, Statistics, WindowExpr,
41
};
42
use ahash::RandomState;
43
use arrow::{
44
    array::{Array, ArrayRef, RecordBatchOptions, UInt32Builder},
45
    compute::{concat, concat_batches, sort_to_indices},
46
    datatypes::SchemaRef,
47
    record_batch::RecordBatch,
48
};
49
use datafusion_common::hash_utils::create_hashes;
50
use datafusion_common::stats::Precision;
51
use datafusion_common::utils::{
52
    evaluate_partition_ranges, get_at_indices, get_record_batch_at_indices,
53
    get_row_at_idx, take_arrays,
54
};
55
use datafusion_common::{arrow_datafusion_err, exec_err, DataFusionError, Result};
56
use datafusion_execution::TaskContext;
57
use datafusion_expr::window_state::{PartitionBatchState, WindowAggState};
58
use datafusion_expr::ColumnarValue;
59
use datafusion_physical_expr::window::{
60
    PartitionBatches, PartitionKey, PartitionWindowAggStates, WindowState,
61
};
62
use datafusion_physical_expr::PhysicalExpr;
63
use datafusion_physical_expr_common::sort_expr::LexRequirement;
64
use futures::stream::Stream;
65
use futures::{ready, StreamExt};
66
use hashbrown::raw::RawTable;
67
use indexmap::IndexMap;
68
use log::debug;
69
70
/// Window execution plan
71
#[derive(Debug)]
72
pub struct BoundedWindowAggExec {
73
    /// Input plan
74
    input: Arc<dyn ExecutionPlan>,
75
    /// Window function expression
76
    window_expr: Vec<Arc<dyn WindowExpr>>,
77
    /// Schema after the window is run
78
    schema: SchemaRef,
79
    /// Partition Keys
80
    pub partition_keys: Vec<Arc<dyn PhysicalExpr>>,
81
    /// Execution metrics
82
    metrics: ExecutionPlanMetricsSet,
83
    /// Describes how the input is ordered relative to the partition keys
84
    pub input_order_mode: InputOrderMode,
85
    /// Partition by indices that define ordering
86
    // For example, if input ordering is ORDER BY a, b and window expression
87
    // contains PARTITION BY b, a; `ordered_partition_by_indices` would be 1, 0.
88
    // Similarly, if window expression contains PARTITION BY a, b; then
89
    // `ordered_partition_by_indices` would be 0, 1.
90
    // See `get_ordered_partition_by_indices` for more details.
91
    ordered_partition_by_indices: Vec<usize>,
92
    /// Cache holding plan properties like equivalences, output partitioning etc.
93
    cache: PlanProperties,
94
}
95
96
impl BoundedWindowAggExec {
97
    /// Create a new execution plan for window aggregates
98
2
    pub fn try_new(
99
2
        window_expr: Vec<Arc<dyn WindowExpr>>,
100
2
        input: Arc<dyn ExecutionPlan>,
101
2
        partition_keys: Vec<Arc<dyn PhysicalExpr>>,
102
2
        input_order_mode: InputOrderMode,
103
2
    ) -> Result<Self> {
104
2
        let schema = create_schema(&input.schema(), &window_expr)
?0
;
105
2
        let schema = Arc::new(schema);
106
2
        let partition_by_exprs = window_expr[0].partition_by();
107
2
        let ordered_partition_by_indices = match &input_order_mode {
108
            InputOrderMode::Sorted => {
109
1
                let indices = get_ordered_partition_by_indices(
110
1
                    window_expr[0].partition_by(),
111
1
                    &input,
112
1
                );
113
1
                if indices.len() == partition_by_exprs.len() {
114
1
                    indices
115
                } else {
116
0
                    (0..partition_by_exprs.len()).collect::<Vec<_>>()
117
                }
118
            }
119
0
            InputOrderMode::PartiallySorted(ordered_indices) => ordered_indices.clone(),
120
            InputOrderMode::Linear => {
121
1
                vec![]
122
            }
123
        };
124
2
        let cache = Self::compute_properties(&input, &schema, &window_expr);
125
2
        Ok(Self {
126
2
            input,
127
2
            window_expr,
128
2
            schema,
129
2
            partition_keys,
130
2
            metrics: ExecutionPlanMetricsSet::new(),
131
2
            input_order_mode,
132
2
            ordered_partition_by_indices,
133
2
            cache,
134
2
        })
135
2
    }
136
137
    /// Window expressions
138
3
    pub fn window_expr(&self) -> &[Arc<dyn WindowExpr>] {
139
3
        &self.window_expr
140
3
    }
141
142
    /// Input plan
143
2
    pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
144
2
        &self.input
145
2
    }
146
147
    /// Return the output sort order of partition keys: For example
148
    /// OVER(PARTITION BY a, ORDER BY b) -> would give sorting of the column a
149
    // We are sure that partition by columns are always at the beginning of sort_keys
150
    // Hence returned `PhysicalSortExpr` corresponding to `PARTITION BY` columns can be used safely
151
    // to calculate partition separation points
152
2
    pub fn partition_by_sort_keys(&self) -> Result<Vec<PhysicalSortExpr>> {
153
2
        let partition_by = self.window_expr()[0].partition_by();
154
2
        get_partition_by_sort_exprs(
155
2
            &self.input,
156
2
            partition_by,
157
2
            &self.ordered_partition_by_indices,
158
2
        )
159
2
    }
160
161
    /// Initializes the appropriate [`PartitionSearcher`] implementation from
162
    /// the state.
163
2
    fn get_search_algo(&self) -> Result<Box<dyn PartitionSearcher>> {
164
2
        let partition_by_sort_keys = self.partition_by_sort_keys()
?0
;
165
2
        let ordered_partition_by_indices = self.ordered_partition_by_indices.clone();
166
2
        let input_schema = self.input().schema();
167
2
        Ok(match &self.input_order_mode {
168
            InputOrderMode::Sorted => {
169
                // In Sorted mode, all partition by columns should be ordered.
170
1
                if self.window_expr()[0].partition_by().len()
171
1
                    != ordered_partition_by_indices.len()
172
                {
173
0
                    return exec_err!("All partition by columns should have an ordering in Sorted mode.");
174
1
                }
175
1
                Box::new(SortedSearch {
176
1
                    partition_by_sort_keys,
177
1
                    ordered_partition_by_indices,
178
1
                    input_schema,
179
1
                })
180
            }
181
1
            InputOrderMode::Linear | InputOrderMode::PartiallySorted(_) => Box::new(
182
1
                LinearSearch::new(ordered_partition_by_indices, input_schema),
183
1
            ),
184
        })
185
2
    }
186
187
    /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
188
2
    fn compute_properties(
189
2
        input: &Arc<dyn ExecutionPlan>,
190
2
        schema: &SchemaRef,
191
2
        window_expr: &[Arc<dyn WindowExpr>],
192
2
    ) -> PlanProperties {
193
2
        // Calculate equivalence properties:
194
2
        let eq_properties = window_equivalence_properties(schema, input, window_expr);
195
2
196
2
        // As we can have repartitioning using the partition keys, this can
197
2
        // be either one or more than one, depending on the presence of
198
2
        // repartitioning.
199
2
        let output_partitioning = input.output_partitioning().clone();
200
2
201
2
        // Construct properties cache
202
2
        PlanProperties::new(
203
2
            eq_properties,          // Equivalence Properties
204
2
            output_partitioning,    // Output Partitioning
205
2
            input.execution_mode(), // Execution Mode
206
2
        )
207
2
    }
208
}
209
210
impl DisplayAs for BoundedWindowAggExec {
211
2
    fn fmt_as(
212
2
        &self,
213
2
        t: DisplayFormatType,
214
2
        f: &mut std::fmt::Formatter,
215
2
    ) -> std::fmt::Result {
216
2
        match t {
217
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
218
2
                write!(f, "BoundedWindowAggExec: ")
?0
;
219
2
                let g: Vec<String> = self
220
2
                    .window_expr
221
2
                    .iter()
222
4
                    .map(|e| {
223
4
                        format!(
224
4
                            "{}: {:?}, frame: {:?}",
225
4
                            e.name().to_owned(),
226
4
                            e.field(),
227
4
                            e.get_window_frame()
228
4
                        )
229
4
                    })
230
2
                    .collect();
231
2
                let mode = &self.input_order_mode;
232
2
                write!(f, "wdw=[{}], mode=[{:?}]", g.join(", "), mode)
?0
;
233
            }
234
        }
235
2
        Ok(())
236
2
    }
237
}
238
239
impl ExecutionPlan for BoundedWindowAggExec {
240
0
    fn name(&self) -> &'static str {
241
0
        "BoundedWindowAggExec"
242
0
    }
243
244
    /// Return a reference to Any that can be used for downcasting
245
0
    fn as_any(&self) -> &dyn Any {
246
0
        self
247
0
    }
248
249
6
    fn properties(&self) -> &PlanProperties {
250
6
        &self.cache
251
6
    }
252
253
2
    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
254
2
        vec![&self.input]
255
2
    }
256
257
0
    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
258
0
        let partition_bys = self.window_expr()[0].partition_by();
259
0
        let order_keys = self.window_expr()[0].order_by();
260
0
        let partition_bys = self
261
0
            .ordered_partition_by_indices
262
0
            .iter()
263
0
            .map(|idx| &partition_bys[*idx]);
264
0
        vec![calc_requirements(partition_bys, order_keys)]
265
0
    }
266
267
0
    fn required_input_distribution(&self) -> Vec<Distribution> {
268
0
        if self.partition_keys.is_empty() {
269
0
            debug!("No partition defined for BoundedWindowAggExec!!!");
270
0
            vec![Distribution::SinglePartition]
271
        } else {
272
0
            vec![Distribution::HashPartitioned(self.partition_keys.clone())]
273
        }
274
0
    }
275
276
0
    fn maintains_input_order(&self) -> Vec<bool> {
277
0
        vec![true]
278
0
    }
279
280
0
    fn with_new_children(
281
0
        self: Arc<Self>,
282
0
        children: Vec<Arc<dyn ExecutionPlan>>,
283
0
    ) -> Result<Arc<dyn ExecutionPlan>> {
284
0
        Ok(Arc::new(BoundedWindowAggExec::try_new(
285
0
            self.window_expr.clone(),
286
0
            Arc::clone(&children[0]),
287
0
            self.partition_keys.clone(),
288
0
            self.input_order_mode.clone(),
289
0
        )?))
290
0
    }
291
292
2
    fn execute(
293
2
        &self,
294
2
        partition: usize,
295
2
        context: Arc<TaskContext>,
296
2
    ) -> Result<SendableRecordBatchStream> {
297
2
        let input = self.input.execute(partition, context)
?0
;
298
2
        let search_mode = self.get_search_algo()
?0
;
299
2
        let stream = Box::pin(BoundedWindowAggStream::new(
300
2
            Arc::clone(&self.schema),
301
2
            self.window_expr.clone(),
302
2
            input,
303
2
            BaselineMetrics::new(&self.metrics, partition),
304
2
            search_mode,
305
2
        )
?0
);
306
2
        Ok(stream)
307
2
    }
308
309
0
    fn metrics(&self) -> Option<MetricsSet> {
310
0
        Some(self.metrics.clone_inner())
311
0
    }
312
313
0
    fn statistics(&self) -> Result<Statistics> {
314
0
        let input_stat = self.input.statistics()?;
315
0
        let win_cols = self.window_expr.len();
316
0
        let input_cols = self.input.schema().fields().len();
317
0
        // TODO stats: some windowing function will maintain invariants such as min, max...
318
0
        let mut column_statistics = Vec::with_capacity(win_cols + input_cols);
319
0
        // copy stats of the input to the beginning of the schema.
320
0
        column_statistics.extend(input_stat.column_statistics);
321
0
        for _ in 0..win_cols {
322
0
            column_statistics.push(ColumnStatistics::new_unknown())
323
        }
324
0
        Ok(Statistics {
325
0
            num_rows: input_stat.num_rows,
326
0
            column_statistics,
327
0
            total_byte_size: Precision::Absent,
328
0
        })
329
0
    }
330
}
331
332
/// Trait that specifies how we search for (or calculate) partitions. It has two
333
/// implementations: [`SortedSearch`] and [`LinearSearch`].
334
trait PartitionSearcher: Send {
335
    /// This method constructs output columns using the result of each window expression
336
    /// (each entry in the output vector comes from a window expression).
337
    /// Executor when producing output concatenates `input_buffer` (corresponding section), and
338
    /// result of this function to generate output `RecordBatch`. `input_buffer` is used to determine
339
    /// which sections of the window expression results should be used to generate output.
340
    /// `partition_buffers` contains corresponding section of the `RecordBatch` for each partition.
341
    /// `window_agg_states` stores per partition state for each window expression.
342
    /// None case means that no result is generated
343
    /// `Some(Vec<ArrayRef>)` is the result of each window expression.
344
    fn calculate_out_columns(
345
        &mut self,
346
        input_buffer: &RecordBatch,
347
        window_agg_states: &[PartitionWindowAggStates],
348
        partition_buffers: &mut PartitionBatches,
349
        window_expr: &[Arc<dyn WindowExpr>],
350
    ) -> Result<Option<Vec<ArrayRef>>>;
351
352
    /// Determine whether `[InputOrderMode]` is `[InputOrderMode::Linear]` or not.
353
3
    fn is_mode_linear(&self) -> bool {
354
3
        false
355
3
    }
356
357
    // Constructs corresponding batches for each partition for the record_batch.
358
    fn evaluate_partition_batches(
359
        &mut self,
360
        record_batch: &RecordBatch,
361
        window_expr: &[Arc<dyn WindowExpr>],
362
    ) -> Result<Vec<(PartitionKey, RecordBatch)>>;
363
364
    /// Prunes the state.
365
3
    fn prune(&mut self, _n_out: usize) {}
366
367
    /// Marks the partition as done if we are sure that corresponding partition
368
    /// cannot receive any more values.
369
    fn mark_partition_end(&self, partition_buffers: &mut PartitionBatches);
370
371
    /// Updates `input_buffer` and `partition_buffers` with the new `record_batch`.
372
8
    fn update_partition_batch(
373
8
        &mut self,
374
8
        input_buffer: &mut RecordBatch,
375
8
        record_batch: RecordBatch,
376
8
        window_expr: &[Arc<dyn WindowExpr>],
377
8
        partition_buffers: &mut PartitionBatches,
378
8
    ) -> Result<()> {
379
8
        if record_batch.num_rows() == 0 {
380
0
            return Ok(());
381
8
        }
382
8
        let partition_batches =
383
8
            self.evaluate_partition_batches(&record_batch, window_expr)
?0
;
384
16
        for (
partition_row, partition_batch8
) in partition_batches {
385
8
            let partition_batch_state = partition_buffers
386
8
                .entry(partition_row)
387
8
                // Use input_schema for the buffer schema, not `record_batch.schema()`
388
8
                // as it may not have the "correct" schema in terms of output
389
8
                // nullability constraints. For details, see the following issue:
390
8
                // https://github.com/apache/datafusion/issues/9320
391
8
                .or_insert_with(|| {
392
4
                    PartitionBatchState::new(Arc::clone(self.input_schema()))
393
8
                });
394
8
            partition_batch_state.extend(&partition_batch)
?0
;
395
        }
396
397
8
        if self.is_mode_linear() {
398
            // In `Linear` mode, it is guaranteed that the first ORDER BY column
399
            // is sorted across partitions. Note that only the first ORDER BY
400
            // column is guaranteed to be ordered. As a counter example, consider
401
            // the case, `PARTITION BY b, ORDER BY a, c` when the input is sorted
402
            // by `[a, b, c]`. In this case, `BoundedWindowAggExec` mode will be
403
            // `Linear`. However, we cannot guarantee that the last row of the
404
            // input data will be the "last" data in terms of the ordering requirement
405
            // `[a, c]` -- it will be the "last" data in terms of `[a, b, c]`.
406
            // Hence, only column `a` should be used as a guarantee of the "last"
407
            // data across partitions. For other modes (`Sorted`, `PartiallySorted`),
408
            // we do not need to keep track of the most recent row guarantee across
409
            // partitions. Since leading ordering separates partitions, guaranteed
410
            // by the most recent row, already prune the previous partitions completely.
411
5
            let last_row = get_last_row_batch(&record_batch)
?0
;
412
9
            for (_, partition_batch) in 
partition_buffers.iter_mut()5
{
413
9
                partition_batch.set_most_recent_row(last_row.clone());
414
9
            }
415
3
        }
416
8
        self.mark_partition_end(partition_buffers);
417
418
8
        *input_buffer = if input_buffer.num_rows() == 0 {
419
4
            record_batch
420
        } else {
421
4
            concat_batches(self.input_schema(), [input_buffer, &record_batch])
?0
422
        };
423
424
8
        Ok(())
425
8
    }
426
427
    fn input_schema(&self) -> &SchemaRef;
428
}
429
430
/// This object encapsulates the algorithm state for a simple linear scan
431
/// algorithm for computing partitions.
432
pub struct LinearSearch {
433
    /// Keeps the hash of input buffer calculated from PARTITION BY columns.
434
    /// Its length is equal to the `input_buffer` length.
435
    input_buffer_hashes: VecDeque<u64>,
436
    /// Used during hash value calculation.
437
    random_state: RandomState,
438
    /// Input ordering and partition by key ordering need not be the same, so
439
    /// this vector stores the mapping between them. For instance, if the input
440
    /// is ordered by a, b and the window expression contains a PARTITION BY b, a
441
    /// clause, this attribute stores [1, 0].
442
    ordered_partition_by_indices: Vec<usize>,
443
    /// We use this [`RawTable`] to calculate unique partitions for each new
444
    /// RecordBatch. First entry in the tuple is the hash value, the second
445
    /// entry is the unique ID for each partition (increments from 0 to n).
446
    row_map_batch: RawTable<(u64, usize)>,
447
    /// We use this [`RawTable`] to calculate the output columns that we can
448
    /// produce at each cycle. First entry in the tuple is the hash value, the
449
    /// second entry is the unique ID for each partition (increments from 0 to n).
450
    /// The third entry stores how many new outputs are calculated for the
451
    /// corresponding partition.
452
    row_map_out: RawTable<(u64, usize, usize)>,
453
    input_schema: SchemaRef,
454
}
455
456
impl PartitionSearcher for LinearSearch {
457
    /// This method constructs output columns using the result of each window expression.
458
    // Assume input buffer is         |      Partition Buffers would be (Where each partition and its data is seperated)
459
    // a, 2                           |      a, 2
460
    // b, 2                           |      a, 2
461
    // a, 2                           |      a, 2
462
    // b, 2                           |
463
    // a, 2                           |      b, 2
464
    // b, 2                           |      b, 2
465
    // b, 2                           |      b, 2
466
    //                                |      b, 2
467
    // Also assume we happen to calculate 2 new values for a, and 3 for b (To be calculate missing values we may need to consider future values).
468
    // Partition buffers effectively will be
469
    // a, 2, 1
470
    // a, 2, 2
471
    // a, 2, (missing)
472
    //
473
    // b, 2, 1
474
    // b, 2, 2
475
    // b, 2, 3
476
    // b, 2, (missing)
477
    // When partition buffers are mapped back to the original record batch. Result becomes
478
    // a, 2, 1
479
    // b, 2, 1
480
    // a, 2, 2
481
    // b, 2, 2
482
    // a, 2, (missing)
483
    // b, 2, 3
484
    // b, 2, (missing)
485
    // This function calculates the column result of window expression(s) (First 4 entry of 3rd column in the above section.)
486
    // 1
487
    // 1
488
    // 2
489
    // 2
490
    // Above section corresponds to calculated result which can be emitted without breaking input buffer ordering.
491
5
    fn calculate_out_columns(
492
5
        &mut self,
493
5
        input_buffer: &RecordBatch,
494
5
        window_agg_states: &[PartitionWindowAggStates],
495
5
        partition_buffers: &mut PartitionBatches,
496
5
        window_expr: &[Arc<dyn WindowExpr>],
497
5
    ) -> Result<Option<Vec<ArrayRef>>> {
498
5
        let partition_output_indices = self.calc_partition_output_indices(
499
5
            input_buffer,
500
5
            window_agg_states,
501
5
            window_expr,
502
5
        )
?0
;
503
504
5
        let n_window_col = window_agg_states.len();
505
5
        let mut new_columns = vec![vec![]; n_window_col];
506
5
        // Size of all_indices can be at most input_buffer.num_rows():
507
5
        let mut all_indices = UInt32Builder::with_capacity(input_buffer.num_rows());
508
9
        for (
row, indices4
) in partition_output_indices {
509
4
            let length = indices.len();
510
4
            for (idx, window_agg_state) in window_agg_states.iter().enumerate() {
511
4
                let partition = &window_agg_state[&row];
512
4
                let values = Arc::clone(&partition.state.out_col.slice(0, length));
513
4
                new_columns[idx].push(values);
514
4
            }
515
4
            let partition_batch_state = &mut partition_buffers[&row];
516
4
            // Store how many rows are generated for each partition
517
4
            partition_batch_state.n_out_row = length;
518
4
            // For each row keep corresponding index in the input record batch
519
4
            all_indices.append_slice(&indices);
520
        }
521
5
        let all_indices = all_indices.finish();
522
5
        if all_indices.is_empty() {
523
            // We couldn't generate any new value, return early:
524
1
            return Ok(None);
525
4
        }
526
527
        // Concatenate results for each column by converting `Vec<Vec<ArrayRef>>`
528
        // to Vec<ArrayRef> where inner `Vec<ArrayRef>`s are converted to `ArrayRef`s.
529
4
        let new_columns = new_columns
530
4
            .iter()
531
4
            .map(|items| {
532
4
                concat(&items.iter().map(|e| e.as_ref()).collect::<Vec<_>>())
533
4
                    .map_err(|e| 
arrow_datafusion_err!(e)0
)
534
4
            })
535
4
            .collect::<Result<Vec<_>>>()
?0
;
536
        // We should emit columns according to row index ordering.
537
4
        let sorted_indices = sort_to_indices(&all_indices, None, None)
?0
;
538
        // Construct new column according to row ordering. This fixes ordering
539
4
        take_arrays(&new_columns, &sorted_indices).map(Some)
540
5
    }
541
542
5
    fn evaluate_partition_batches(
543
5
        &mut self,
544
5
        record_batch: &RecordBatch,
545
5
        window_expr: &[Arc<dyn WindowExpr>],
546
5
    ) -> Result<Vec<(PartitionKey, RecordBatch)>> {
547
5
        let partition_bys =
548
5
            evaluate_partition_by_column_values(record_batch, window_expr)
?0
;
549
        // NOTE: In Linear or PartiallySorted modes, we are sure that
550
        //       `partition_bys` are not empty.
551
        // Calculate indices for each partition and construct a new record
552
        // batch from the rows at these indices for each partition:
553
5
        self.get_per_partition_indices(&partition_bys, record_batch)
?0
554
5
            .into_iter()
555
5
            .map(|(row, indices)| {
556
5
                let mut new_indices = UInt32Builder::with_capacity(indices.len());
557
5
                new_indices.append_slice(&indices);
558
5
                let indices = new_indices.finish();
559
5
                Ok((row, get_record_batch_at_indices(record_batch, &indices)
?0
))
560
5
            })
561
5
            .collect()
562
5
    }
563
564
4
    fn prune(&mut self, n_out: usize) {
565
4
        // Delete hashes for the rows that are outputted.
566
4
        self.input_buffer_hashes.drain(0..n_out);
567
4
    }
568
569
5
    fn mark_partition_end(&self, partition_buffers: &mut PartitionBatches) {
570
5
        // We should be in the `PartiallySorted` case, otherwise we can not
571
5
        // tell when we are at the end of a given partition.
572
5
        if !self.ordered_partition_by_indices.is_empty() {
573
0
            if let Some((last_row, _)) = partition_buffers.last() {
574
0
                let last_sorted_cols = self
575
0
                    .ordered_partition_by_indices
576
0
                    .iter()
577
0
                    .map(|idx| last_row[*idx].clone())
578
0
                    .collect::<Vec<_>>();
579
0
                for (row, partition_batch_state) in partition_buffers.iter_mut() {
580
0
                    let sorted_cols = self
581
0
                        .ordered_partition_by_indices
582
0
                        .iter()
583
0
                        .map(|idx| &row[*idx]);
584
0
                    // All the partitions other than `last_sorted_cols` are done.
585
0
                    // We are sure that we will no longer receive values for these
586
0
                    // partitions (arrival of a new value would violate ordering).
587
0
                    partition_batch_state.is_end = !sorted_cols.eq(&last_sorted_cols);
588
0
                }
589
0
            }
590
5
        }
591
5
    }
592
593
5
    fn is_mode_linear(&self) -> bool {
594
5
        self.ordered_partition_by_indices.is_empty()
595
5
    }
596
597
7
    fn input_schema(&self) -> &SchemaRef {
598
7
        &self.input_schema
599
7
    }
600
}
601
602
impl LinearSearch {
603
    /// Initialize a new [`LinearSearch`] partition searcher.
604
1
    fn new(ordered_partition_by_indices: Vec<usize>, input_schema: SchemaRef) -> Self {
605
1
        LinearSearch {
606
1
            input_buffer_hashes: VecDeque::new(),
607
1
            random_state: Default::default(),
608
1
            ordered_partition_by_indices,
609
1
            row_map_batch: RawTable::with_capacity(256),
610
1
            row_map_out: RawTable::with_capacity(256),
611
1
            input_schema,
612
1
        }
613
1
    }
614
615
    /// Calculate indices of each partition (according to PARTITION BY expression)
616
    /// `columns` contain partition by expression results.
617
5
    fn get_per_partition_indices(
618
5
        &mut self,
619
5
        columns: &[ArrayRef],
620
5
        batch: &RecordBatch,
621
5
    ) -> Result<Vec<(PartitionKey, Vec<u32>)>> {
622
5
        let mut batch_hashes = vec![0; batch.num_rows()];
623
5
        create_hashes(columns, &self.random_state, &mut batch_hashes)
?0
;
624
5
        self.input_buffer_hashes.extend(&batch_hashes);
625
5
        // reset row_map for new calculation
626
5
        self.row_map_batch.clear();
627
5
        // res stores PartitionKey and row indices (indices where these partition occurs in the `batch`) for each partition.
628
5
        let mut result: Vec<(PartitionKey, Vec<u32>)> = vec![];
629
10
        for (hash, row_idx) in 
batch_hashes.into_iter().zip(0u32..)5
{
630
10
            let entry = self.row_map_batch.get_mut(hash, |(_, group_idx)| {
631
5
                // We can safely get the first index of the partition indices
632
5
                // since partition indices has one element during initialization.
633
5
                let row = get_row_at_idx(columns, row_idx as usize).unwrap();
634
5
                // Handle hash collusions with an equality check:
635
5
                row.eq(&result[*group_idx].0)
636
10
            });
637
10
            if let Some((_, 
group_idx5
)) = entry {
638
5
                result[*group_idx].1.push(row_idx)
639
            } else {
640
5
                self.row_map_batch
641
5
                    .insert(hash, (hash, result.len()), |(hash, _)| 
*hash0
);
642
5
                let row = get_row_at_idx(columns, row_idx as usize)
?0
;
643
                // This is a new partition its only index is row_idx for now.
644
5
                result.push((row, vec![row_idx]));
645
            }
646
        }
647
5
        Ok(result)
648
5
    }
649
650
    /// Calculates partition keys and result indices for each partition.
651
    /// The return value is a vector of tuples where the first entry stores
652
    /// the partition key (unique for each partition) and the second entry
653
    /// stores indices of the rows for which the partition is constructed.
654
5
    fn calc_partition_output_indices(
655
5
        &mut self,
656
5
        input_buffer: &RecordBatch,
657
5
        window_agg_states: &[PartitionWindowAggStates],
658
5
        window_expr: &[Arc<dyn WindowExpr>],
659
5
    ) -> Result<Vec<(PartitionKey, Vec<u32>)>> {
660
5
        let partition_by_columns =
661
5
            evaluate_partition_by_column_values(input_buffer, window_expr)
?0
;
662
        // Reset the row_map state:
663
5
        self.row_map_out.clear();
664
5
        let mut partition_indices: Vec<(PartitionKey, Vec<u32>)> = vec![];
665
13
        for (hash, row_idx) in 
self.input_buffer_hashes.iter().zip(0u32..)5
{
666
13
            let entry = self.row_map_out.get_mut(*hash, |(_, group_idx, _)| {
667
6
                let row =
668
6
                    get_row_at_idx(&partition_by_columns, row_idx as usize).unwrap();
669
6
                row == partition_indices[*group_idx].0
670
13
            });
671
13
            if let Some((_, 
group_idx, n_out6
)) = entry {
672
6
                let (_, indices) = &mut partition_indices[*group_idx];
673
6
                if indices.len() >= *n_out {
674
2
                    break;
675
4
                }
676
4
                indices.push(row_idx);
677
            } else {
678
7
                let row = get_row_at_idx(&partition_by_columns, row_idx as usize)
?0
;
679
7
                let min_out = window_agg_states
680
7
                    .iter()
681
7
                    .map(|window_agg_state| {
682
7
                        window_agg_state
683
7
                            .get(&row)
684
7
                            .map(|partition| partition.state.out_col.len())
685
7
                            .unwrap_or(0)
686
7
                    })
687
7
                    .min()
688
7
                    .unwrap_or(0);
689
7
                if min_out == 0 {
690
3
                    break;
691
4
                }
692
4
                self.row_map_out.insert(
693
4
                    *hash,
694
4
                    (*hash, partition_indices.len(), min_out),
695
4
                    |(hash, _, _)| 
*hash0
,
696
4
                );
697
4
                partition_indices.push((row, vec![row_idx]));
698
4
            }
699
        }
700
5
        Ok(partition_indices)
701
5
    }
702
}
703
704
/// This object encapsulates the algorithm state for sorted searching
705
/// when computing partitions.
706
pub struct SortedSearch {
707
    /// Stores partition by columns and their ordering information
708
    partition_by_sort_keys: Vec<PhysicalSortExpr>,
709
    /// Input ordering and partition by key ordering need not be the same, so
710
    /// this vector stores the mapping between them. For instance, if the input
711
    /// is ordered by a, b and the window expression contains a PARTITION BY b, a
712
    /// clause, this attribute stores [1, 0].
713
    ordered_partition_by_indices: Vec<usize>,
714
    input_schema: SchemaRef,
715
}
716
717
impl PartitionSearcher for SortedSearch {
718
    /// This method constructs new output columns using the result of each window expression.
719
4
    fn calculate_out_columns(
720
4
        &mut self,
721
4
        _input_buffer: &RecordBatch,
722
4
        window_agg_states: &[PartitionWindowAggStates],
723
4
        partition_buffers: &mut PartitionBatches,
724
4
        _window_expr: &[Arc<dyn WindowExpr>],
725
4
    ) -> Result<Option<Vec<ArrayRef>>> {
726
4
        let n_out = self.calculate_n_out_row(window_agg_states, partition_buffers);
727
4
        if n_out == 0 {
728
1
            Ok(None)
729
        } else {
730
3
            window_agg_states
731
3
                .iter()
732
9
                .map(|map| get_aggregate_result_out_column(map, n_out).map(Some))
733
3
                .collect()
734
        }
735
4
    }
736
737
3
    fn evaluate_partition_batches(
738
3
        &mut self,
739
3
        record_batch: &RecordBatch,
740
3
        _window_expr: &[Arc<dyn WindowExpr>],
741
3
    ) -> Result<Vec<(PartitionKey, RecordBatch)>> {
742
3
        let num_rows = record_batch.num_rows();
743
        // Calculate result of partition by column expressions
744
3
        let partition_columns = self
745
3
            .partition_by_sort_keys
746
3
            .iter()
747
3
            .map(|elem| 
elem.evaluate_to_sort_column(record_batch)0
)
748
3
            .collect::<Result<Vec<_>>>()
?0
;
749
        // Reorder `partition_columns` such that its ordering matches input ordering.
750
3
        let partition_columns_ordered =
751
3
            get_at_indices(&partition_columns, &self.ordered_partition_by_indices)
?0
;
752
3
        let partition_points =
753
3
            evaluate_partition_ranges(num_rows, &partition_columns_ordered)
?0
;
754
3
        let partition_bys = partition_columns
755
3
            .into_iter()
756
3
            .map(|arr| 
arr.values0
)
757
3
            .collect::<Vec<ArrayRef>>();
758
3
759
3
        partition_points
760
3
            .iter()
761
3
            .map(|range| {
762
3
                let row = get_row_at_idx(&partition_bys, range.start)
?0
;
763
3
                let len = range.end - range.start;
764
3
                let slice = record_batch.slice(range.start, len);
765
3
                Ok((row, slice))
766
3
            })
767
3
            .collect::<Result<Vec<_>>>()
768
3
    }
769
770
3
    fn mark_partition_end(&self, partition_buffers: &mut PartitionBatches) {
771
3
        // In Sorted case. We can mark all partitions besides last partition as ended.
772
3
        // We are sure that those partitions will never receive any values.
773
3
        // (Otherwise ordering invariant is violated.)
774
3
        let n_partitions = partition_buffers.len();
775
3
        for (idx, (_, partition_batch_state)) in partition_buffers.iter_mut().enumerate()
776
3
        {
777
3
            partition_batch_state.is_end |= idx < n_partitions - 1;
778
3
        }
779
3
    }
780
781
1
    fn input_schema(&self) -> &SchemaRef {
782
1
        &self.input_schema
783
1
    }
784
}
785
786
impl SortedSearch {
787
    /// Calculates how many rows we can output.
788
4
    fn calculate_n_out_row(
789
4
        &mut self,
790
4
        window_agg_states: &[PartitionWindowAggStates],
791
4
        partition_buffers: &mut PartitionBatches,
792
4
    ) -> usize {
793
4
        // Different window aggregators may produce results at different rates.
794
4
        // We produce the overall batch result only as fast as the slowest one.
795
4
        let mut counts = vec![];
796
12
        let out_col_counts = window_agg_states.iter().map(|window_agg_state| {
797
12
            // Store how many elements are generated for the current
798
12
            // window expression:
799
12
            let mut cur_window_expr_out_result_len = 0;
800
12
            // We iterate over `window_agg_state`, which is an IndexMap.
801
12
            // Iterations follow the insertion order, hence we preserve
802
12
            // sorting when partition columns are sorted.
803
12
            let mut per_partition_out_results = HashMap::new();
804
12
            for (row, WindowState { state, .. }) in window_agg_state.iter() {
805
12
                cur_window_expr_out_result_len += state.out_col.len();
806
12
                let count = per_partition_out_results.entry(row).or_insert(0);
807
12
                if *count < state.out_col.len() {
808
9
                    *count = state.out_col.len();
809
9
                }
3
810
                // If we do not generate all results for the current
811
                // partition, we do not generate results for next
812
                // partition --  otherwise we will lose input ordering.
813
12
                if state.n_row_result_missing > 0 {
814
0
                    break;
815
12
                }
816
            }
817
12
            counts.push(per_partition_out_results);
818
12
            cur_window_expr_out_result_len
819
12
        });
820
4
        argmin(out_col_counts).map_or(0, |(min_idx, minima)| {
821
4
            for (row, count) in counts.swap_remove(min_idx).into_iter() {
822
4
                let partition_batch = &mut partition_buffers[row];
823
4
                partition_batch.n_out_row = count;
824
4
            }
825
4
            minima
826
4
        })
827
4
    }
828
}
829
830
/// Calculates partition by expression results for each window expression
831
/// on `record_batch`.
832
10
fn evaluate_partition_by_column_values(
833
10
    record_batch: &RecordBatch,
834
10
    window_expr: &[Arc<dyn WindowExpr>],
835
10
) -> Result<Vec<ArrayRef>> {
836
10
    window_expr[0]
837
10
        .partition_by()
838
10
        .iter()
839
10
        .map(|item| match item.evaluate(record_batch)
?0
{
840
10
            ColumnarValue::Array(array) => Ok(array),
841
0
            ColumnarValue::Scalar(scalar) => {
842
0
                scalar.to_array_of_size(record_batch.num_rows())
843
            }
844
10
        })
845
10
        .collect()
846
10
}
847
848
/// Stream for the bounded window aggregation plan.
849
pub struct BoundedWindowAggStream {
850
    schema: SchemaRef,
851
    input: SendableRecordBatchStream,
852
    /// The record batch executor receives as input (i.e. the columns needed
853
    /// while calculating aggregation results).
854
    input_buffer: RecordBatch,
855
    /// We separate `input_buffer` based on partitions (as
856
    /// determined by PARTITION BY columns) and store them per partition
857
    /// in `partition_batches`. We use this variable when calculating results
858
    /// for each window expression. This enables us to use the same batch for
859
    /// different window expressions without copying.
860
    // Note that we could keep record batches for each window expression in
861
    // `PartitionWindowAggStates`. However, this would use more memory (as
862
    // many times as the number of window expressions).
863
    partition_buffers: PartitionBatches,
864
    /// An executor can run multiple window expressions if the PARTITION BY
865
    /// and ORDER BY sections are same. We keep state of the each window
866
    /// expression inside `window_agg_states`.
867
    window_agg_states: Vec<PartitionWindowAggStates>,
868
    finished: bool,
869
    window_expr: Vec<Arc<dyn WindowExpr>>,
870
    baseline_metrics: BaselineMetrics,
871
    /// Search mode for partition columns. This determines the algorithm with
872
    /// which we group each partition.
873
    search_mode: Box<dyn PartitionSearcher>,
874
}
875
876
impl BoundedWindowAggStream {
877
    /// Prunes sections of the state that are no longer needed when calculating
878
    /// results (as determined by window frame boundaries and number of results generated).
879
    // For instance, if first `n` (not necessarily same with `n_out`) elements are no longer needed to
880
    // calculate window expression result (outside the window frame boundary) we retract first `n` elements
881
    // from `self.partition_batches` in corresponding partition.
882
    // For instance, if `n_out` number of rows are calculated, we can remove
883
    // first `n_out` rows from `self.input_buffer`.
884
7
    fn prune_state(&mut self, n_out: usize) -> Result<()> {
885
7
        // Prune `self.window_agg_states`:
886
7
        self.prune_out_columns();
887
7
        // Prune `self.partition_batches`:
888
7
        self.prune_partition_batches();
889
7
        // Prune `self.input_buffer`:
890
7
        self.prune_input_batch(n_out)
?0
;
891
        // Prune internal state of search algorithm.
892
7
        self.search_mode.prune(n_out);
893
7
        Ok(())
894
7
    }
895
}
896
897
impl Stream for BoundedWindowAggStream {
898
    type Item = Result<RecordBatch>;
899
900
564
    fn poll_next(
901
564
        mut self: Pin<&mut Self>,
902
564
        cx: &mut Context<'_>,
903
564
    ) -> Poll<Option<Self::Item>> {
904
564
        let poll = self.poll_next_inner(cx);
905
564
        self.baseline_metrics.record_poll(poll)
906
564
    }
907
}
908
909
impl BoundedWindowAggStream {
910
    /// Create a new BoundedWindowAggStream
911
2
    fn new(
912
2
        schema: SchemaRef,
913
2
        window_expr: Vec<Arc<dyn WindowExpr>>,
914
2
        input: SendableRecordBatchStream,
915
2
        baseline_metrics: BaselineMetrics,
916
2
        search_mode: Box<dyn PartitionSearcher>,
917
2
    ) -> Result<Self> {
918
4
        let state = window_expr.iter().map(|_| IndexMap::new()).collect();
919
2
        let empty_batch = RecordBatch::new_empty(Arc::clone(&schema));
920
2
        Ok(Self {
921
2
            schema,
922
2
            input,
923
2
            input_buffer: empty_batch,
924
2
            partition_buffers: IndexMap::new(),
925
2
            window_agg_states: state,
926
2
            finished: false,
927
2
            window_expr,
928
2
            baseline_metrics,
929
2
            search_mode,
930
2
        })
931
2
    }
932
933
9
    fn compute_aggregates(&mut self) -> Result<RecordBatch> {
934
        // calculate window cols
935
17
        for (cur_window_expr, state) in
936
9
            self.window_expr.iter().zip(&mut self.window_agg_states)
937
        {
938
17
            cur_window_expr.evaluate_stateful(&self.partition_buffers, state)
?0
;
939
        }
940
941
9
        let schema = Arc::clone(&self.schema);
942
9
        let window_expr_out = self.search_mode.calculate_out_columns(
943
9
            &self.input_buffer,
944
9
            &self.window_agg_states,
945
9
            &mut self.partition_buffers,
946
9
            &self.window_expr,
947
9
        )
?0
;
948
9
        if let Some(
window_expr_out7
) = window_expr_out {
949
7
            let n_out = window_expr_out[0].len();
950
7
            // right append new columns to corresponding section in the original input buffer.
951
7
            let columns_to_show = self
952
7
                .input_buffer
953
7
                .columns()
954
7
                .iter()
955
11
                .map(|elem| elem.slice(0, n_out))
956
7
                .chain(window_expr_out)
957
7
                .collect::<Vec<_>>();
958
7
            let n_generated = columns_to_show[0].len();
959
7
            self.prune_state(n_generated)
?0
;
960
7
            Ok(RecordBatch::try_new(schema, columns_to_show)
?0
)
961
        } else {
962
2
            Ok(RecordBatch::new_empty(schema))
963
        }
964
9
    }
965
966
    #[inline]
967
564
    fn poll_next_inner(
968
564
        &mut self,
969
564
        cx: &mut Context<'_>,
970
564
    ) -> Poll<Option<Result<RecordBatch>>> {
971
564
        if self.finished {
972
1
            return Poll::Ready(None);
973
563
        }
974
975
563
        let 
result9
= match
ready!554
(self.input.poll_next_unpin(cx)) {
976
8
            Some(Ok(batch)) => {
977
8
                self.search_mode.update_partition_batch(
978
8
                    &mut self.input_buffer,
979
8
                    batch,
980
8
                    &self.window_expr,
981
8
                    &mut self.partition_buffers,
982
8
                )
?0
;
983
8
                self.compute_aggregates()
984
            }
985
0
            Some(Err(e)) => Err(e),
986
            None => {
987
1
                self.finished = true;
988
1
                for (_, partition_batch_state) in self.partition_buffers.iter_mut() {
989
1
                    partition_batch_state.is_end = true;
990
1
                }
991
1
                self.compute_aggregates()
992
            }
993
        };
994
9
        Poll::Ready(Some(result))
995
564
    }
996
997
    /// Prunes the sections of the record batch (for each partition)
998
    /// that we no longer need to calculate the window function result.
999
7
    fn prune_partition_batches(&mut self) {
1000
7
        // Remove partitions which we know already ended (is_end flag is true).
1001
7
        // Since the retain method preserves insertion order, we still have
1002
7
        // ordering in between partitions after removal.
1003
7
        self.partition_buffers
1004
11
            .retain(|_, partition_batch_state| !partition_batch_state.is_end);
1005
7
1006
7
        // The data in `self.partition_batches` is used by all window expressions.
1007
7
        // Therefore, when removing from `self.partition_batches`, we need to remove
1008
7
        // from the earliest range boundary among all window expressions. Variable
1009
7
        // `n_prune_each_partition` fill the earliest range boundary information for
1010
7
        // each partition. This way, we can delete the no-longer-needed sections from
1011
7
        // `self.partition_batches`.
1012
7
        // For instance, if window frame one uses [10, 20] and window frame two uses
1013
7
        // [5, 15]; we only prune the first 5 elements from the corresponding record
1014
7
        // batch in `self.partition_batches`.
1015
7
1016
7
        // Calculate how many elements to prune for each partition batch
1017
7
        let mut n_prune_each_partition = HashMap::new();
1018
13
        for window_agg_state in 
self.window_agg_states.iter_mut()7
{
1019
17
            
window_agg_state.retain(13
|_, WindowState { state, .. }| !state.is_end
)13
;
1020
30
            for (
partition_row, WindowState { state: value17
, .. }) in window_agg_state {
1021
17
                let n_prune =
1022
17
                    min(value.window_frame_range.start, value.last_calculated_index);
1023
17
                if let Some(
current6
) = n_prune_each_partition.get_mut(partition_row) {
1024
6
                    if n_prune < *current {
1025
3
                        *current = n_prune;
1026
3
                    }
1027
11
                } else {
1028
11
                    n_prune_each_partition.insert(partition_row.clone(), n_prune);
1029
11
                }
1030
            }
1031
        }
1032
1033
        // Retract no longer needed parts during window calculations from partition batch:
1034
11
        for (partition_row, n_prune) in 
n_prune_each_partition.iter()7
{
1035
11
            let pb_state = &mut self.partition_buffers[partition_row];
1036
11
1037
11
            let batch = &pb_state.record_batch;
1038
11
            pb_state.record_batch = batch.slice(*n_prune, batch.num_rows() - n_prune);
1039
11
            pb_state.n_out_row = 0;
1040
1041
            // Update state indices since we have pruned some rows from the beginning:
1042
17
            for window_agg_state in 
self.window_agg_states.iter_mut()11
{
1043
17
                window_agg_state[partition_row].state.prune_state(*n_prune);
1044
17
            }
1045
        }
1046
7
    }
1047
1048
    /// Prunes the section of the input batch whose aggregate results
1049
    /// are calculated and emitted.
1050
7
    fn prune_input_batch(&mut self, n_out: usize) -> Result<()> {
1051
7
        // Prune first n_out rows from the input_buffer
1052
7
        let n_to_keep = self.input_buffer.num_rows() - n_out;
1053
7
        let batch_to_keep = self
1054
7
            .input_buffer
1055
7
            .columns()
1056
7
            .iter()
1057
11
            .map(|elem| elem.slice(n_out, n_to_keep))
1058
7
            .collect::<Vec<_>>();
1059
7
        self.input_buffer = RecordBatch::try_new_with_options(
1060
7
            self.input_buffer.schema(),
1061
7
            batch_to_keep,
1062
7
            &RecordBatchOptions::new().with_row_count(Some(n_to_keep)),
1063
7
        )
?0
;
1064
7
        Ok(())
1065
7
    }
1066
1067
    /// Prunes emitted parts from WindowAggState `out_col` field.
1068
7
    fn prune_out_columns(&mut self) {
1069
        // We store generated columns for each window expression in the `out_col`
1070
        // field of `WindowAggState`. Given how many rows are emitted, we remove
1071
        // these sections from state.
1072
13
        for partition_window_agg_states in 
self.window_agg_states.iter_mut()7
{
1073
            // Remove `n_out` entries from the `out_col` field of `WindowAggState`.
1074
            // `n_out` is stored in `self.partition_buffers` for each partition.
1075
            // If `is_end` is set, directly remove them; this shrinks the hash map.
1076
13
            partition_window_agg_states
1077
17
                .retain(|_, partition_batch_state| !partition_batch_state.state.is_end
)13
;
1078
            for (
1079
17
                partition_key,
1080
17
                WindowState {
1081
17
                    state: WindowAggState { out_col, .. },
1082
                    ..
1083
                },
1084
30
            ) in partition_window_agg_states
1085
17
            {
1086
17
                let partition_batch = &mut self.partition_buffers[partition_key];
1087
17
                let n_to_del = partition_batch.n_out_row;
1088
17
                let n_to_keep = out_col.len() - n_to_del;
1089
17
                *out_col = out_col.slice(n_to_del, n_to_keep);
1090
17
            }
1091
        }
1092
7
    }
1093
}
1094
1095
impl RecordBatchStream for BoundedWindowAggStream {
1096
    /// Get the schema
1097
0
    fn schema(&self) -> SchemaRef {
1098
0
        Arc::clone(&self.schema)
1099
0
    }
1100
}
1101
1102
// Gets the index of minimum entry, returns None if empty.
1103
4
fn argmin<T: PartialOrd>(data: impl Iterator<Item = T>) -> Option<(usize, T)> {
1104
4
    data.enumerate()
1105
8
        .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(Ordering::Equal))
1106
4
}
1107
1108
/// Calculates the section we can show results for expression
1109
9
fn get_aggregate_result_out_column(
1110
9
    partition_window_agg_states: &PartitionWindowAggStates,
1111
9
    len_to_show: usize,
1112
9
) -> Result<ArrayRef> {
1113
9
    let mut result = None;
1114
9
    let mut running_length = 0;
1115
    // We assume that iteration order is according to insertion order
1116
    for (
1117
        _,
1118
        WindowState {
1119
9
            state: WindowAggState { out_col, .. },
1120
            ..
1121
        },
1122
18
    ) in partition_window_agg_states
1123
    {
1124
9
        if running_length < len_to_show {
1125
9
            let n_to_use = min(len_to_show - running_length, out_col.len());
1126
9
            let slice_to_use = out_col.slice(0, n_to_use);
1127
9
            result = Some(match result {
1128
0
                Some(arr) => concat(&[&arr, &slice_to_use])?,
1129
9
                None => slice_to_use,
1130
            });
1131
9
            running_length += n_to_use;
1132
        } else {
1133
0
            break;
1134
        }
1135
    }
1136
9
    if running_length != len_to_show {
1137
0
        return exec_err!(
1138
0
            "Generated row number should be {len_to_show}, it is {running_length}"
1139
0
        );
1140
9
    }
1141
9
    result
1142
9
        .ok_or_else(|| 
DataFusionError::Execution("Should contain something".to_string())0
)
1143
9
}
1144
1145
/// Constructs a batch from the last row of batch in the argument.
1146
5
pub(crate) fn get_last_row_batch(batch: &RecordBatch) -> Result<RecordBatch> {
1147
5
    if batch.num_rows() == 0 {
1148
0
        return exec_err!("Latest batch should have at least 1 row");
1149
5
    }
1150
5
    Ok(batch.slice(batch.num_rows() - 1, 1))
1151
5
}
1152
1153
#[cfg(test)]
1154
mod tests {
1155
    use std::pin::Pin;
1156
    use std::sync::Arc;
1157
    use std::task::{Context, Poll};
1158
    use std::time::Duration;
1159
1160
    use crate::common::collect;
1161
    use crate::memory::MemoryExec;
1162
    use crate::projection::ProjectionExec;
1163
    use crate::streaming::{PartitionStream, StreamingTableExec};
1164
    use crate::windows::{create_window_expr, BoundedWindowAggExec, InputOrderMode};
1165
    use crate::{execute_stream, get_plan_string, ExecutionPlan};
1166
1167
    use arrow_array::builder::{Int64Builder, UInt64Builder};
1168
    use arrow_array::RecordBatch;
1169
    use arrow_schema::{DataType, Field, Schema, SchemaRef, SortOptions};
1170
    use datafusion_common::{
1171
        assert_batches_eq, exec_datafusion_err, Result, ScalarValue,
1172
    };
1173
    use datafusion_execution::config::SessionConfig;
1174
    use datafusion_execution::{
1175
        RecordBatchStream, SendableRecordBatchStream, TaskContext,
1176
    };
1177
    use datafusion_expr::{
1178
        WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition,
1179
    };
1180
    use datafusion_functions_aggregate::count::count_udaf;
1181
    use datafusion_physical_expr::expressions::{col, Column, NthValue};
1182
    use datafusion_physical_expr::window::{
1183
        BuiltInWindowExpr, BuiltInWindowFunctionExpr,
1184
    };
1185
    use datafusion_physical_expr::{LexOrdering, PhysicalExpr, PhysicalSortExpr};
1186
1187
    use futures::future::Shared;
1188
    use futures::{pin_mut, ready, FutureExt, Stream, StreamExt};
1189
    use itertools::Itertools;
1190
    use tokio::time::timeout;
1191
1192
    #[derive(Debug, Clone)]
1193
    struct TestStreamPartition {
1194
        schema: SchemaRef,
1195
        batches: Vec<RecordBatch>,
1196
        idx: usize,
1197
        state: PolingState,
1198
        sleep_duration: Duration,
1199
        send_exit: bool,
1200
    }
1201
1202
    impl PartitionStream for TestStreamPartition {
1203
1
        fn schema(&self) -> &SchemaRef {
1204
1
            &self.schema
1205
1
        }
1206
1207
1
        fn execute(&self, _ctx: Arc<TaskContext>) -> SendableRecordBatchStream {
1208
1
            // We create an iterator from the record batches and map them into Ok values,
1209
1
            // converting the iterator into a futures::stream::Stream
1210
1
            Box::pin(self.clone())
1211
1
        }
1212
    }
1213
1214
    impl Stream for TestStreamPartition {
1215
        type Item = Result<RecordBatch>;
1216
1217
559
        fn poll_next(
1218
559
            mut self: Pin<&mut Self>,
1219
559
            cx: &mut Context<'_>,
1220
559
        ) -> Poll<Option<Self::Item>> {
1221
559
            self.poll_next_inner(cx)
1222
559
        }
1223
    }
1224
1225
    #[derive(Debug, Clone)]
1226
    enum PolingState {
1227
        Sleep(Shared<futures::future::BoxFuture<'static, ()>>),
1228
        BatchReturn,
1229
    }
1230
1231
    impl TestStreamPartition {
1232
559
        fn poll_next_inner(
1233
559
            self: &mut Pin<&mut Self>,
1234
559
            cx: &mut Context<'_>,
1235
559
        ) -> Poll<Option<Result<RecordBatch>>> {
1236
            loop {
1237
1.10k
                match &mut self.state {
1238
                    PolingState::BatchReturn => {
1239
                        // Wait for self.sleep_duration before sending any new data
1240
277
                        let f = tokio::time::sleep(self.sleep_duration).boxed().shared();
1241
277
                        self.state = PolingState::Sleep(f);
1242
5
                        let input_batch = if let Some(batch) =
1243
277
                            self.batches.clone().get(self.idx)
1244
                        {
1245
5
                            batch.clone()
1246
272
                        } else if self.send_exit {
1247
                            // Send None to signal end of data
1248
0
                            return Poll::Ready(None);
1249
                        } else {
1250
                            // Go to sleep mode
1251
272
                            let f =
1252
272
                                tokio::time::sleep(self.sleep_duration).boxed().shared();
1253
272
                            self.state = PolingState::Sleep(f);
1254
272
                            continue;
1255
                        };
1256
5
                        self.idx += 1;
1257
5
                        return Poll::Ready(Some(Ok(input_batch)));
1258
                    }
1259
830
                    PolingState::Sleep(future) => {
1260
830
                        pin_mut!(future);
1261
830
                        
ready!554
(future.poll_unpin(cx));
1262
276
                        self.state = PolingState::BatchReturn;
1263
                    }
1264
                }
1265
            }
1266
559
        }
1267
    }
1268
1269
    impl RecordBatchStream for TestStreamPartition {
1270
0
        fn schema(&self) -> SchemaRef {
1271
0
            Arc::clone(&self.schema)
1272
0
        }
1273
    }
1274
1275
1
    fn bounded_window_exec_pb_latent_range(
1276
1
        input: Arc<dyn ExecutionPlan>,
1277
1
        n_future_range: usize,
1278
1
        hash: &str,
1279
1
        order_by: &str,
1280
1
    ) -> Result<Arc<dyn ExecutionPlan>> {
1281
1
        let schema = input.schema();
1282
1
        let window_fn = WindowFunctionDefinition::AggregateUDF(count_udaf());
1283
1
        let col_expr =
1284
1
            Arc::new(Column::new(schema.fields[0].name(), 0)) as Arc<dyn PhysicalExpr>;
1285
1
        let args = vec![col_expr];
1286
1
        let partitionby_exprs = vec![col(hash, &schema)
?0
];
1287
1
        let orderby_exprs = vec![PhysicalSortExpr {
1288
1
            expr: col(order_by, &schema)
?0
,
1289
1
            options: SortOptions::default(),
1290
1
        }];
1291
1
        let window_frame = WindowFrame::new_bounds(
1292
1
            WindowFrameUnits::Range,
1293
1
            WindowFrameBound::CurrentRow,
1294
1
            WindowFrameBound::Following(ScalarValue::UInt64(Some(n_future_range as u64))),
1295
1
        );
1296
1
        let fn_name = format!(
1297
1
            "{}({:?}) PARTITION BY: [{:?}], ORDER BY: [{:?}]",
1298
1
            window_fn, args, partitionby_exprs, orderby_exprs
1299
1
        );
1300
1
        let input_order_mode = InputOrderMode::Linear;
1301
1
        Ok(Arc::new(BoundedWindowAggExec::try_new(
1302
1
            vec![create_window_expr(
1303
1
                &window_fn,
1304
1
                fn_name,
1305
1
                &args,
1306
1
                &partitionby_exprs,
1307
1
                &orderby_exprs,
1308
1
                Arc::new(window_frame),
1309
1
                &input.schema(),
1310
1
                false,
1311
1
            )
?0
],
1312
1
            input,
1313
1
            partitionby_exprs,
1314
1
            input_order_mode,
1315
0
        )?))
1316
1
    }
1317
1318
1
    fn projection_exec(input: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> {
1319
1
        let schema = input.schema();
1320
1
        let exprs = input
1321
1
            .schema()
1322
1
            .fields
1323
1
            .iter()
1324
1
            .enumerate()
1325
3
            .map(|(idx, field)| {
1326
3
                let name = if field.name().len() > 20 {
1327
1
                    format!("col_{idx}")
1328
                } else {
1329
2
                    field.name().clone()
1330
                };
1331
3
                let expr = col(field.name(), &schema).unwrap();
1332
3
                (expr, name)
1333
3
            })
1334
1
            .collect::<Vec<_>>();
1335
1
        Ok(Arc::new(ProjectionExec::try_new(exprs, input)
?0
))
1336
1
    }
1337
1338
1
    fn task_context_helper() -> TaskContext {
1339
1
        let task_ctx = TaskContext::default();
1340
1
        // Create session context with config
1341
1
        let session_config = SessionConfig::new()
1342
1
            .with_batch_size(1)
1343
1
            .with_target_partitions(2)
1344
1
            .with_round_robin_repartition(false);
1345
1
        task_ctx.with_session_config(session_config)
1346
1
    }
1347
1348
1
    fn task_context() -> Arc<TaskContext> {
1349
1
        Arc::new(task_context_helper())
1350
1
    }
1351
1352
1
    pub async fn collect_stream(
1353
1
        mut stream: SendableRecordBatchStream,
1354
1
        results: &mut Vec<RecordBatch>,
1355
1
    ) -> Result<()> {
1356
553
        while let Some(
item5
) =
stream.next()6
.await {
1357
5
            results.push(item
?0
);
1358
        }
1359
0
        Ok(())
1360
0
    }
1361
1362
    /// Execute the [ExecutionPlan] and collect the results in memory
1363
1
    pub async fn collect_with_timeout(
1364
1
        plan: Arc<dyn ExecutionPlan>,
1365
1
        context: Arc<TaskContext>,
1366
1
        timeout_duration: Duration,
1367
1
    ) -> Result<Vec<RecordBatch>> {
1368
1
        let stream = execute_stream(plan, context)
?0
;
1369
1
        let mut results = vec![];
1370
1
1371
1
        // Execute the asynchronous operation with a timeout
1372
1
        if timeout(timeout_duration, collect_stream(stream, &mut results))
1373
553
            .await
1374
1
            .is_ok()
1375
        {
1376
0
            return Err(exec_datafusion_err!("shouldn't have completed"));
1377
1
        };
1378
1
1379
1
        Ok(results)
1380
1
    }
1381
1382
    /// Execute the [ExecutionPlan] and collect the results in memory
1383
    #[allow(dead_code)]
1384
0
    pub async fn collect_bonafide(
1385
0
        plan: Arc<dyn ExecutionPlan>,
1386
0
        context: Arc<TaskContext>,
1387
0
    ) -> Result<Vec<RecordBatch>> {
1388
0
        let stream = execute_stream(plan, context)?;
1389
0
        let mut results = vec![];
1390
0
1391
0
        collect_stream(stream, &mut results).await?;
1392
1393
0
        Ok(results)
1394
0
    }
1395
1396
1
    fn test_schema() -> SchemaRef {
1397
1
        Arc::new(Schema::new(vec![
1398
1
            Field::new("sn", DataType::UInt64, true),
1399
1
            Field::new("hash", DataType::Int64, true),
1400
1
        ]))
1401
1
    }
1402
1403
1
    fn schema_orders(schema: &SchemaRef) -> Result<Vec<LexOrdering>> {
1404
1
        let orderings = vec![vec![PhysicalSortExpr {
1405
1
            expr: col("sn", schema)
?0
,
1406
1
            options: SortOptions {
1407
1
                descending: false,
1408
1
                nulls_first: false,
1409
1
            },
1410
1
        }]];
1411
1
        Ok(orderings)
1412
1
    }
1413
1414
1
    fn is_integer_division_safe(lhs: usize, rhs: usize) -> bool {
1415
1
        let res = lhs / rhs;
1416
1
        res * rhs == lhs
1417
1
    }
1418
1
    fn generate_batches(
1419
1
        schema: &SchemaRef,
1420
1
        n_row: usize,
1421
1
        n_chunk: usize,
1422
1
    ) -> Result<Vec<RecordBatch>> {
1423
1
        let mut batches = vec![];
1424
1
        assert!(n_row > 0);
1425
1
        assert!(n_chunk > 0);
1426
1
        assert!(is_integer_division_safe(n_row, n_chunk));
1427
1
        let hash_replicate = 4;
1428
1
1429
1
        let chunks = (0..n_row)
1430
1
            .chunks(n_chunk)
1431
1
            .into_iter()
1432
5
            .map(|elem| elem.into_iter().collect::<Vec<_>>())
1433
1
            .collect::<Vec<_>>();
1434
1435
        // Send 2 RecordBatches at the source
1436
6
        for 
sn_values5
in chunks {
1437
5
            let mut sn1_array = UInt64Builder::with_capacity(sn_values.len());
1438
5
            let mut hash_array = Int64Builder::with_capacity(sn_values.len());
1439
1440
15
            for 
sn10
in sn_values {
1441
10
                sn1_array.append_value(sn as u64);
1442
10
                let hash_value = (2 - (sn / hash_replicate)) as i64;
1443
10
                hash_array.append_value(hash_value);
1444
10
            }
1445
1446
5
            let batch = RecordBatch::try_new(
1447
5
                Arc::clone(schema),
1448
5
                vec![Arc::new(sn1_array.finish()), Arc::new(hash_array.finish())],
1449
5
            )
?0
;
1450
5
            batches.push(batch);
1451
        }
1452
1
        Ok(batches)
1453
1
    }
1454
1455
1
    fn generate_never_ending_source(
1456
1
        n_rows: usize,
1457
1
        chunk_length: usize,
1458
1
        n_partition: usize,
1459
1
        is_infinite: bool,
1460
1
        send_exit: bool,
1461
1
        per_batch_wait_duration_in_millis: u64,
1462
1
    ) -> Result<Arc<dyn ExecutionPlan>> {
1463
1
        assert!(n_partition > 0);
1464
1465
        // We use same hash value in the table. This makes sure that
1466
        // After hashing computation will continue in only in one of the output partitions
1467
        // In this case, data flow should still continue
1468
1
        let schema = test_schema();
1469
1
        let orderings = schema_orders(&schema)
?0
;
1470
1471
        // Source waits per_batch_wait_duration_in_millis ms before sending other batch
1472
1
        let per_batch_wait_duration =
1473
1
            Duration::from_millis(per_batch_wait_duration_in_millis);
1474
1475
1
        let batches = generate_batches(&schema, n_rows, chunk_length)
?0
;
1476
1477
        // Source has 2 partitions
1478
1
        let partitions = vec![
1479
1
            Arc::new(TestStreamPartition {
1480
1
                schema: Arc::clone(&schema),
1481
1
                batches,
1482
1
                idx: 0,
1483
1
                state: PolingState::BatchReturn,
1484
1
                sleep_duration: per_batch_wait_duration,
1485
1
                send_exit,
1486
1
            }) as _;
1487
1
            n_partition
1488
1
        ];
1489
1
        let source = Arc::new(StreamingTableExec::try_new(
1490
1
            Arc::clone(&schema),
1491
1
            partitions,
1492
1
            None,
1493
1
            orderings,
1494
1
            is_infinite,
1495
1
            None,
1496
1
        )
?0
) as _;
1497
1
        Ok(source)
1498
1
    }
1499
1500
    // Tests NTH_VALUE(negative index) with memoize feature.
1501
    // To be able to trigger memoize feature for NTH_VALUE we need to
1502
    // - feed BoundedWindowAggExec with batch stream data.
1503
    // - Window frame should contain UNBOUNDED PRECEDING.
1504
    // It is hard to ensure these conditions are met from a SQL query.
1505
    #[tokio::test]
1506
1
    async fn test_window_nth_value_bounded_memoize() -> Result<()> {
1507
1
        let config = SessionConfig::new().with_target_partitions(1);
1508
1
        let task_ctx = Arc::new(TaskContext::default().with_session_config(config));
1509
1
1510
1
        let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
1511
1
        // Create a new batch of data to insert into the table
1512
1
        let batch = RecordBatch::try_new(
1513
1
            Arc::clone(&schema),
1514
1
            vec![Arc::new(arrow_array::Int32Array::from(vec![1, 2, 3]))],
1515
1
        )
?0
;
1516
1
1517
1
        let memory_exec = MemoryExec::try_new(
1518
1
            &[vec![batch.clone(), batch.clone(), batch.clone()]],
1519
1
            Arc::clone(&schema),
1520
1
            None,
1521
1
        )
1522
1
        .map(|e| Arc::new(e) as Arc<dyn ExecutionPlan>)
?0
;
1523
1
        let col_a = col("a", &schema)
?0
;
1524
1
        let nth_value_func1 = NthValue::nth(
1525
1
            "nth_value(-1)",
1526
1
            Arc::clone(&col_a),
1527
1
            DataType::Int32,
1528
1
            1,
1529
1
            false,
1530
1
        )
?0
1531
1
        .reverse_expr()
1532
1
        .unwrap();
1533
1
        let nth_value_func2 = NthValue::nth(
1534
1
            "nth_value(-2)",
1535
1
            Arc::clone(&col_a),
1536
1
            DataType::Int32,
1537
1
            2,
1538
1
            false,
1539
1
        )
?0
1540
1
        .reverse_expr()
1541
1
        .unwrap();
1542
1
        let last_value_func = Arc::new(NthValue::last(
1543
1
            "last",
1544
1
            Arc::clone(&col_a),
1545
1
            DataType::Int32,
1546
1
            false,
1547
1
        )) as _;
1548
1
        let window_exprs = vec![
1549
1
            // LAST_VALUE(a)
1550
1
            Arc::new(BuiltInWindowExpr::new(
1551
1
                last_value_func,
1552
1
                &[],
1553
1
                &[],
1554
1
                Arc::new(WindowFrame::new_bounds(
1555
1
                    WindowFrameUnits::Rows,
1556
1
                    WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
1557
1
                    WindowFrameBound::CurrentRow,
1558
1
                )),
1559
1
            )) as _,
1560
1
            // NTH_VALUE(a, -1)
1561
1
            Arc::new(BuiltInWindowExpr::new(
1562
1
                nth_value_func1,
1563
1
                &[],
1564
1
                &[],
1565
1
                Arc::new(WindowFrame::new_bounds(
1566
1
                    WindowFrameUnits::Rows,
1567
1
                    WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
1568
1
                    WindowFrameBound::CurrentRow,
1569
1
                )),
1570
1
            )) as _,
1571
1
            // NTH_VALUE(a, -2)
1572
1
            Arc::new(BuiltInWindowExpr::new(
1573
1
                nth_value_func2,
1574
1
                &[],
1575
1
                &[],
1576
1
                Arc::new(WindowFrame::new_bounds(
1577
1
                    WindowFrameUnits::Rows,
1578
1
                    WindowFrameBound::Preceding(ScalarValue::UInt64(None)),
1579
1
                    WindowFrameBound::CurrentRow,
1580
1
                )),
1581
1
            )) as _,
1582
1
        ];
1583
1
        let physical_plan = BoundedWindowAggExec::try_new(
1584
1
            window_exprs,
1585
1
            memory_exec,
1586
1
            vec![],
1587
1
            InputOrderMode::Sorted,
1588
1
        )
1589
1
        .map(|e| Arc::new(e) as Arc<dyn ExecutionPlan>)
?0
;
1590
1
1591
1
        let batches = collect(physical_plan.execute(0, task_ctx)
?0
).
await0
?0
;
1592
1
1593
1
        let expected = vec![
1594
1
            "BoundedWindowAggExec: wdw=[last: Ok(Field { name: \"last\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, nth_value(-1): Ok(Field { name: \"nth_value(-1)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, nth_value(-2): Ok(Field { name: \"nth_value(-2)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]",
1595
1
            "  MemoryExec: partitions=1, partition_sizes=[3]",
1596
1
        ];
1597
1
        // Get string representation of the plan
1598
1
        let actual = get_plan_string(&physical_plan);
1599
1
        assert_eq!(
1600
1
            expected, actual,
1601
1
            
"\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"0
1602
1
        );
1603
1
1604
1
        let expected = [
1605
1
            "+---+------+---------------+---------------+",
1606
1
            "| a | last | nth_value(-1) | nth_value(-2) |",
1607
1
            "+---+------+---------------+---------------+",
1608
1
            "| 1 | 1    | 1             |               |",
1609
1
            "| 2 | 2    | 2             | 1             |",
1610
1
            "| 3 | 3    | 3             | 2             |",
1611
1
            "| 1 | 1    | 1             | 3             |",
1612
1
            "| 2 | 2    | 2             | 1             |",
1613
1
            "| 3 | 3    | 3             | 2             |",
1614
1
            "| 1 | 1    | 1             | 3             |",
1615
1
            "| 2 | 2    | 2             | 1             |",
1616
1
            "| 3 | 3    | 3             | 2             |",
1617
1
            "+---+------+---------------+---------------+",
1618
1
        ];
1619
1
        assert_batches_eq!(expected, &batches);
1620
1
        Ok(())
1621
1
    }
1622
1623
    // This test checks whether the most-recent-row guarantee given by the input batches of the `BoundedWindowAggExec`
1624
    // helps `BoundedWindowAggExec` generate low-latency results in `Linear` mode.
1625
    // Input data generated at the source is
1626
    //       "+----+------+",
1627
    //       "| sn | hash |",
1628
    //       "+----+------+",
1629
    //       "| 0  | 2    |",
1630
    //       "| 1  | 2    |",
1631
    //       "| 2  | 2    |",
1632
    //       "| 3  | 2    |",
1633
    //       "| 4  | 1    |",
1634
    //       "| 5  | 1    |",
1635
    //       "| 6  | 1    |",
1636
    //       "| 7  | 1    |",
1637
    //       "| 8  | 0    |",
1638
    //       "| 9  | 0    |",
1639
    //       "+----+------+",
1640
    //
1641
    // Effectively following query is run on this data
1642
    //
1643
    //   SELECT *, count(*) OVER(PARTITION BY hash ORDER BY sn RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING)
1644
    //   FROM test;
1645
    //
1646
    // The partition `hash=2` receives the following data from the input
1647
    //
1648
    //       "+----+------+",
1649
    //       "| sn | hash |",
1650
    //       "+----+------+",
1651
    //       "| 0  | 2    |",
1652
    //       "| 1  | 2    |",
1653
    //       "| 2  | 2    |",
1654
    //       "| 3  | 2    |",
1655
    //       "+----+------+",
1656
    // Normally `BoundedWindowAggExec` can only generate the following result from the input above
1657
    //
1658
    //       "+----+------+---------+",
1659
    //       "| sn | hash |  count  |",
1660
    //       "+----+------+---------+",
1661
    //       "| 0  | 2    |  2      |",
1662
    //       "| 1  | 2    |  2      |",
1663
    //       "| 2  | 2    |<not yet>|",
1664
    //       "| 3  | 2    |<not yet>|",
1665
    //       "+----+------+---------+",
1666
    // where the result of the last 2 rows is missing, because the window frame end may still change with future data
1667
    // since window frame end is determined by 1 following (To generate result for row=3[where sn=2] we
1668
    // need to receive sn=4 to make sure the window frame end bound won't change with future data).
1669
    //
1670
    // With the ability of different partitions to use global ordering at the input (where most up-to date
1671
    //   row is
1672
    //      "| 9  | 0    |",
1673
    //   )
1674
    //
1675
    // `BoundedWindowAggExec` should be able to generate the following result in the test
1676
    //
1677
    //       "+----+------+-------+",
1678
    //       "| sn | hash | col_2 |",
1679
    //       "+----+------+-------+",
1680
    //       "| 0  | 2    | 2     |",
1681
    //       "| 1  | 2    | 2     |",
1682
    //       "| 2  | 2    | 2     |",
1683
    //       "| 3  | 2    | 1     |",
1684
    //       "| 4  | 1    | 2     |",
1685
    //       "| 5  | 1    | 2     |",
1686
    //       "| 6  | 1    | 2     |",
1687
    //       "| 7  | 1    | 1     |",
1688
    //       "+----+------+-------+",
1689
    //
1690
    // where result for all rows except last 2 is calculated (To calculate result for row 9 where sn=8
1691
    //   we need to receive the sn=10 value to calculate its result.).
1692
    // In this test, our aim is to check for which portion of the input data `BoundedWindowAggExec` can generate
1693
    // a result. To test this behaviour, we generated the data at the source infinitely (no `None` signal
1694
    //    is sent to output from source). After, row:
1695
    //
1696
    //       "| 9  | 0    |",
1697
    //
1698
    // is sent, the source stops sending data to the output. We collect the results emitted by the `BoundedWindowAggExec` at the
1699
    // end of the pipeline with a timeout (Since no `None` is sent from source. Collection never ends otherwise).
1700
    #[tokio::test]
1701
1
    async fn bounded_window_exec_linear_mode_range_information() -> Result<()> {
1702
1
        let n_rows = 10;
1703
1
        let chunk_length = 2;
1704
1
        let n_future_range = 1;
1705
1
1706
1
        let timeout_duration = Duration::from_millis(2000);
1707
1
1708
1
        let source =
1709
1
            generate_never_ending_source(n_rows, chunk_length, 1, true, false, 5)
?0
;
1710
1
1711
1
        let window =
1712
1
            bounded_window_exec_pb_latent_range(source, n_future_range, "hash", "sn")
?0
;
1713
1
1714
1
        let plan = projection_exec(window)
?0
;
1715
1
1716
1
        let expected_plan = vec![
1717
1
            "ProjectionExec: expr=[sn@0 as sn, hash@1 as hash, count([Column { name: \"sn\", index: 0 }]) PARTITION BY: [[Column { name: \"hash\", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: \"sn\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]@2 as col_2]",
1718
1
            "  BoundedWindowAggExec: wdw=[count([Column { name: \"sn\", index: 0 }]) PARTITION BY: [[Column { name: \"hash\", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: \"sn\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]: Ok(Field { name: \"count([Column { name: \\\"sn\\\", index: 0 }]) PARTITION BY: [[Column { name: \\\"hash\\\", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: \\\"sn\\\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(1)), is_causal: false }], mode=[Linear]",
1719
1
            "    StreamingTableExec: partition_sizes=1, projection=[sn, hash], infinite_source=true, output_ordering=[sn@0 ASC NULLS LAST]",
1720
1
        ];
1721
1
1722
1
        // Get string representation of the plan
1723
1
        let actual = get_plan_string(&plan);
1724
1
        assert_eq!(
1725
1
            expected_plan, actual,
1726
1
            
"\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected_plan:#?}\nactual:\n\n{actual:#?}\n\n"0
1727
1
        );
1728
1
1729
1
        let task_ctx = task_context();
1730
553
        let 
batches1
=
collect_with_timeout(plan, task_ctx, timeout_duration)1
.await
?0
;
1731
1
1732
1
        let expected = [
1733
1
            "+----+------+-------+",
1734
1
            "| sn | hash | col_2 |",
1735
1
            "+----+------+-------+",
1736
1
            "| 0  | 2    | 2     |",
1737
1
            "| 1  | 2    | 2     |",
1738
1
            "| 2  | 2    | 2     |",
1739
1
            "| 3  | 2    | 1     |",
1740
1
            "| 4  | 1    | 2     |",
1741
1
            "| 5  | 1    | 2     |",
1742
1
            "| 6  | 1    | 2     |",
1743
1
            "| 7  | 1    | 1     |",
1744
1
            "+----+------+-------+",
1745
1
        ];
1746
1
        assert_batches_eq!(expected, &batches);
1747
1
1748
1
        Ok(())
1749
1
    }
1750
}