/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Stream and channel implementations for window function expressions. |
19 | | //! The executor given here uses bounded memory (does not maintain all |
20 | | //! the input data seen so far), which makes it appropriate when processing |
21 | | //! infinite inputs. |
22 | | |
23 | | use std::any::Any; |
24 | | use std::cmp::{min, Ordering}; |
25 | | use std::collections::{HashMap, VecDeque}; |
26 | | use std::pin::Pin; |
27 | | use std::sync::Arc; |
28 | | use std::task::{Context, Poll}; |
29 | | |
30 | | use super::utils::create_schema; |
31 | | use crate::expressions::PhysicalSortExpr; |
32 | | use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; |
33 | | use crate::windows::{ |
34 | | calc_requirements, get_ordered_partition_by_indices, get_partition_by_sort_exprs, |
35 | | window_equivalence_properties, |
36 | | }; |
37 | | use crate::{ |
38 | | ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, |
39 | | ExecutionPlanProperties, InputOrderMode, PlanProperties, RecordBatchStream, |
40 | | SendableRecordBatchStream, Statistics, WindowExpr, |
41 | | }; |
42 | | use ahash::RandomState; |
43 | | use arrow::{ |
44 | | array::{Array, ArrayRef, RecordBatchOptions, UInt32Builder}, |
45 | | compute::{concat, concat_batches, sort_to_indices}, |
46 | | datatypes::SchemaRef, |
47 | | record_batch::RecordBatch, |
48 | | }; |
49 | | use datafusion_common::hash_utils::create_hashes; |
50 | | use datafusion_common::stats::Precision; |
51 | | use datafusion_common::utils::{ |
52 | | evaluate_partition_ranges, get_at_indices, get_record_batch_at_indices, |
53 | | get_row_at_idx, take_arrays, |
54 | | }; |
55 | | use datafusion_common::{arrow_datafusion_err, exec_err, DataFusionError, Result}; |
56 | | use datafusion_execution::TaskContext; |
57 | | use datafusion_expr::window_state::{PartitionBatchState, WindowAggState}; |
58 | | use datafusion_expr::ColumnarValue; |
59 | | use datafusion_physical_expr::window::{ |
60 | | PartitionBatches, PartitionKey, PartitionWindowAggStates, WindowState, |
61 | | }; |
62 | | use datafusion_physical_expr::PhysicalExpr; |
63 | | use datafusion_physical_expr_common::sort_expr::LexRequirement; |
64 | | use futures::stream::Stream; |
65 | | use futures::{ready, StreamExt}; |
66 | | use hashbrown::raw::RawTable; |
67 | | use indexmap::IndexMap; |
68 | | use log::debug; |
69 | | |
70 | | /// Window execution plan |
71 | | #[derive(Debug)] |
72 | | pub struct BoundedWindowAggExec { |
73 | | /// Input plan |
74 | | input: Arc<dyn ExecutionPlan>, |
75 | | /// Window function expression |
76 | | window_expr: Vec<Arc<dyn WindowExpr>>, |
77 | | /// Schema after the window is run |
78 | | schema: SchemaRef, |
79 | | /// Partition Keys |
80 | | pub partition_keys: Vec<Arc<dyn PhysicalExpr>>, |
81 | | /// Execution metrics |
82 | | metrics: ExecutionPlanMetricsSet, |
83 | | /// Describes how the input is ordered relative to the partition keys |
84 | | pub input_order_mode: InputOrderMode, |
85 | | /// Partition by indices that define ordering |
86 | | // For example, if input ordering is ORDER BY a, b and window expression |
87 | | // contains PARTITION BY b, a; `ordered_partition_by_indices` would be 1, 0. |
88 | | // Similarly, if window expression contains PARTITION BY a, b; then |
89 | | // `ordered_partition_by_indices` would be 0, 1. |
90 | | // See `get_ordered_partition_by_indices` for more details. |
91 | | ordered_partition_by_indices: Vec<usize>, |
92 | | /// Cache holding plan properties like equivalences, output partitioning etc. |
93 | | cache: PlanProperties, |
94 | | } |
95 | | |
96 | | impl BoundedWindowAggExec { |
97 | | /// Create a new execution plan for window aggregates |
98 | 2 | pub fn try_new( |
99 | 2 | window_expr: Vec<Arc<dyn WindowExpr>>, |
100 | 2 | input: Arc<dyn ExecutionPlan>, |
101 | 2 | partition_keys: Vec<Arc<dyn PhysicalExpr>>, |
102 | 2 | input_order_mode: InputOrderMode, |
103 | 2 | ) -> Result<Self> { |
104 | 2 | let schema = create_schema(&input.schema(), &window_expr)?0 ; |
105 | 2 | let schema = Arc::new(schema); |
106 | 2 | let partition_by_exprs = window_expr[0].partition_by(); |
107 | 2 | let ordered_partition_by_indices = match &input_order_mode { |
108 | | InputOrderMode::Sorted => { |
109 | 1 | let indices = get_ordered_partition_by_indices( |
110 | 1 | window_expr[0].partition_by(), |
111 | 1 | &input, |
112 | 1 | ); |
113 | 1 | if indices.len() == partition_by_exprs.len() { |
114 | 1 | indices |
115 | | } else { |
116 | 0 | (0..partition_by_exprs.len()).collect::<Vec<_>>() |
117 | | } |
118 | | } |
119 | 0 | InputOrderMode::PartiallySorted(ordered_indices) => ordered_indices.clone(), |
120 | | InputOrderMode::Linear => { |
121 | 1 | vec![] |
122 | | } |
123 | | }; |
124 | 2 | let cache = Self::compute_properties(&input, &schema, &window_expr); |
125 | 2 | Ok(Self { |
126 | 2 | input, |
127 | 2 | window_expr, |
128 | 2 | schema, |
129 | 2 | partition_keys, |
130 | 2 | metrics: ExecutionPlanMetricsSet::new(), |
131 | 2 | input_order_mode, |
132 | 2 | ordered_partition_by_indices, |
133 | 2 | cache, |
134 | 2 | }) |
135 | 2 | } |
136 | | |
137 | | /// Window expressions |
138 | 3 | pub fn window_expr(&self) -> &[Arc<dyn WindowExpr>] { |
139 | 3 | &self.window_expr |
140 | 3 | } |
141 | | |
142 | | /// Input plan |
143 | 2 | pub fn input(&self) -> &Arc<dyn ExecutionPlan> { |
144 | 2 | &self.input |
145 | 2 | } |
146 | | |
147 | | /// Return the output sort order of partition keys: For example |
148 | | /// OVER(PARTITION BY a ORDER BY b) -> would give sorting of the column a |
149 | | // We are sure that partition by columns are always at the beginning of sort_keys |
150 | | // Hence returned `PhysicalSortExpr` corresponding to `PARTITION BY` columns can be used safely |
151 | | // to calculate partition separation points |
152 | 2 | pub fn partition_by_sort_keys(&self) -> Result<Vec<PhysicalSortExpr>> { |
153 | 2 | let partition_by = self.window_expr()[0].partition_by(); |
154 | 2 | get_partition_by_sort_exprs( |
155 | 2 | &self.input, |
156 | 2 | partition_by, |
157 | 2 | &self.ordered_partition_by_indices, |
158 | 2 | ) |
159 | 2 | } |
160 | | |
161 | | /// Initializes the appropriate [`PartitionSearcher`] implementation from |
162 | | /// the state. |
163 | 2 | fn get_search_algo(&self) -> Result<Box<dyn PartitionSearcher>> { |
164 | 2 | let partition_by_sort_keys = self.partition_by_sort_keys()?0 ; |
165 | 2 | let ordered_partition_by_indices = self.ordered_partition_by_indices.clone(); |
166 | 2 | let input_schema = self.input().schema(); |
167 | 2 | Ok(match &self.input_order_mode { |
168 | | InputOrderMode::Sorted => { |
169 | | // In Sorted mode, all partition by columns should be ordered. |
170 | 1 | if self.window_expr()[0].partition_by().len() |
171 | 1 | != ordered_partition_by_indices.len() |
172 | | { |
173 | 0 | return exec_err!("All partition by columns should have an ordering in Sorted mode."); |
174 | 1 | } |
175 | 1 | Box::new(SortedSearch { |
176 | 1 | partition_by_sort_keys, |
177 | 1 | ordered_partition_by_indices, |
178 | 1 | input_schema, |
179 | 1 | }) |
180 | | } |
181 | 1 | InputOrderMode::Linear | InputOrderMode::PartiallySorted(_) => Box::new( |
182 | 1 | LinearSearch::new(ordered_partition_by_indices, input_schema), |
183 | 1 | ), |
184 | | }) |
185 | 2 | } |
186 | | |
187 | | /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. |
188 | 2 | fn compute_properties( |
189 | 2 | input: &Arc<dyn ExecutionPlan>, |
190 | 2 | schema: &SchemaRef, |
191 | 2 | window_expr: &[Arc<dyn WindowExpr>], |
192 | 2 | ) -> PlanProperties { |
193 | 2 | // Calculate equivalence properties: |
194 | 2 | let eq_properties = window_equivalence_properties(schema, input, window_expr); |
195 | 2 | |
196 | 2 | // As we can have repartitioning using the partition keys, this can |
197 | 2 | // be either one or more than one, depending on the presence of |
198 | 2 | // repartitioning. |
199 | 2 | let output_partitioning = input.output_partitioning().clone(); |
200 | 2 | |
201 | 2 | // Construct properties cache |
202 | 2 | PlanProperties::new( |
203 | 2 | eq_properties, // Equivalence Properties |
204 | 2 | output_partitioning, // Output Partitioning |
205 | 2 | input.execution_mode(), // Execution Mode |
206 | 2 | ) |
207 | 2 | } |
208 | | } |
209 | | |
210 | | impl DisplayAs for BoundedWindowAggExec { |
211 | 2 | fn fmt_as( |
212 | 2 | &self, |
213 | 2 | t: DisplayFormatType, |
214 | 2 | f: &mut std::fmt::Formatter, |
215 | 2 | ) -> std::fmt::Result { |
216 | 2 | match t { |
217 | | DisplayFormatType::Default | DisplayFormatType::Verbose => { |
218 | 2 | write!(f, "BoundedWindowAggExec: ")?0 ; |
219 | 2 | let g: Vec<String> = self |
220 | 2 | .window_expr |
221 | 2 | .iter() |
222 | 4 | .map(|e| { |
223 | 4 | format!( |
224 | 4 | "{}: {:?}, frame: {:?}", |
225 | 4 | e.name().to_owned(), |
226 | 4 | e.field(), |
227 | 4 | e.get_window_frame() |
228 | 4 | ) |
229 | 4 | }) |
230 | 2 | .collect(); |
231 | 2 | let mode = &self.input_order_mode; |
232 | 2 | write!(f, "wdw=[{}], mode=[{:?}]", g.join(", "), mode)?0 ; |
233 | | } |
234 | | } |
235 | 2 | Ok(()) |
236 | 2 | } |
237 | | } |
238 | | |
239 | | impl ExecutionPlan for BoundedWindowAggExec { |
240 | 0 | fn name(&self) -> &'static str { |
241 | 0 | "BoundedWindowAggExec" |
242 | 0 | } |
243 | | |
244 | | /// Return a reference to Any that can be used for downcasting |
245 | 0 | fn as_any(&self) -> &dyn Any { |
246 | 0 | self |
247 | 0 | } |
248 | | |
249 | 6 | fn properties(&self) -> &PlanProperties { |
250 | 6 | &self.cache |
251 | 6 | } |
252 | | |
253 | 2 | fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> { |
254 | 2 | vec![&self.input] |
255 | 2 | } |
256 | | |
257 | 0 | fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> { |
258 | 0 | let partition_bys = self.window_expr()[0].partition_by(); |
259 | 0 | let order_keys = self.window_expr()[0].order_by(); |
260 | 0 | let partition_bys = self |
261 | 0 | .ordered_partition_by_indices |
262 | 0 | .iter() |
263 | 0 | .map(|idx| &partition_bys[*idx]); |
264 | 0 | vec![calc_requirements(partition_bys, order_keys)] |
265 | 0 | } |
266 | | |
267 | 0 | fn required_input_distribution(&self) -> Vec<Distribution> { |
268 | 0 | if self.partition_keys.is_empty() { |
269 | 0 | debug!("No partition defined for BoundedWindowAggExec!!!"); |
270 | 0 | vec![Distribution::SinglePartition] |
271 | | } else { |
272 | 0 | vec![Distribution::HashPartitioned(self.partition_keys.clone())] |
273 | | } |
274 | 0 | } |
275 | | |
276 | 0 | fn maintains_input_order(&self) -> Vec<bool> { |
277 | 0 | vec![true] |
278 | 0 | } |
279 | | |
280 | 0 | fn with_new_children( |
281 | 0 | self: Arc<Self>, |
282 | 0 | children: Vec<Arc<dyn ExecutionPlan>>, |
283 | 0 | ) -> Result<Arc<dyn ExecutionPlan>> { |
284 | 0 | Ok(Arc::new(BoundedWindowAggExec::try_new( |
285 | 0 | self.window_expr.clone(), |
286 | 0 | Arc::clone(&children[0]), |
287 | 0 | self.partition_keys.clone(), |
288 | 0 | self.input_order_mode.clone(), |
289 | 0 | )?)) |
290 | 0 | } |
291 | | |
292 | 2 | fn execute( |
293 | 2 | &self, |
294 | 2 | partition: usize, |
295 | 2 | context: Arc<TaskContext>, |
296 | 2 | ) -> Result<SendableRecordBatchStream> { |
297 | 2 | let input = self.input.execute(partition, context)?0 ; |
298 | 2 | let search_mode = self.get_search_algo()?0 ; |
299 | 2 | let stream = Box::pin(BoundedWindowAggStream::new( |
300 | 2 | Arc::clone(&self.schema), |
301 | 2 | self.window_expr.clone(), |
302 | 2 | input, |
303 | 2 | BaselineMetrics::new(&self.metrics, partition), |
304 | 2 | search_mode, |
305 | 2 | )?0 ); |
306 | 2 | Ok(stream) |
307 | 2 | } |
308 | | |
309 | 0 | fn metrics(&self) -> Option<MetricsSet> { |
310 | 0 | Some(self.metrics.clone_inner()) |
311 | 0 | } |
312 | | |
313 | 0 | fn statistics(&self) -> Result<Statistics> { |
314 | 0 | let input_stat = self.input.statistics()?; |
315 | 0 | let win_cols = self.window_expr.len(); |
316 | 0 | let input_cols = self.input.schema().fields().len(); |
317 | 0 | // TODO stats: some windowing functions will maintain invariants such as min, max... |
318 | 0 | let mut column_statistics = Vec::with_capacity(win_cols + input_cols); |
319 | 0 | // copy stats of the input to the beginning of the schema. |
320 | 0 | column_statistics.extend(input_stat.column_statistics); |
321 | 0 | for _ in 0..win_cols { |
322 | 0 | column_statistics.push(ColumnStatistics::new_unknown()) |
323 | | } |
324 | 0 | Ok(Statistics { |
325 | 0 | num_rows: input_stat.num_rows, |
326 | 0 | column_statistics, |
327 | 0 | total_byte_size: Precision::Absent, |
328 | 0 | }) |
329 | 0 | } |
330 | | } |
331 | | |
332 | | /// Trait that specifies how we search for (or calculate) partitions. It has two |
333 | | /// implementations: [`SortedSearch`] and [`LinearSearch`]. |
334 | | trait PartitionSearcher: Send { |
335 | | /// This method constructs output columns using the result of each window expression |
336 | | /// (each entry in the output vector comes from a window expression). |
337 | | /// Executor when producing output concatenates `input_buffer` (corresponding section), and |
338 | | /// result of this function to generate output `RecordBatch`. `input_buffer` is used to determine |
339 | | /// which sections of the window expression results should be used to generate output. |
340 | | /// `partition_buffers` contains corresponding section of the `RecordBatch` for each partition. |
341 | | /// `window_agg_states` stores per partition state for each window expression. |
342 | | /// None case means that no result is generated |
343 | | /// `Some(Vec<ArrayRef>)` is the result of each window expression. |
344 | | fn calculate_out_columns( |
345 | | &mut self, |
346 | | input_buffer: &RecordBatch, |
347 | | window_agg_states: &[PartitionWindowAggStates], |
348 | | partition_buffers: &mut PartitionBatches, |
349 | | window_expr: &[Arc<dyn WindowExpr>], |
350 | | ) -> Result<Option<Vec<ArrayRef>>>; |
351 | | |
352 | | /// Determine whether [`InputOrderMode`] is [`InputOrderMode::Linear`] or not. |
353 | 3 | fn is_mode_linear(&self) -> bool { |
354 | 3 | false |
355 | 3 | } |
356 | | |
357 | | // Constructs corresponding batches for each partition for the record_batch. |
358 | | fn evaluate_partition_batches( |
359 | | &mut self, |
360 | | record_batch: &RecordBatch, |
361 | | window_expr: &[Arc<dyn WindowExpr>], |
362 | | ) -> Result<Vec<(PartitionKey, RecordBatch)>>; |
363 | | |
364 | | /// Prunes the state. |
365 | 3 | fn prune(&mut self, _n_out: usize) {} |
366 | | |
367 | | /// Marks the partition as done if we are sure that corresponding partition |
368 | | /// cannot receive any more values. |
369 | | fn mark_partition_end(&self, partition_buffers: &mut PartitionBatches); |
370 | | |
371 | | /// Updates `input_buffer` and `partition_buffers` with the new `record_batch`. |
372 | 8 | fn update_partition_batch( |
373 | 8 | &mut self, |
374 | 8 | input_buffer: &mut RecordBatch, |
375 | 8 | record_batch: RecordBatch, |
376 | 8 | window_expr: &[Arc<dyn WindowExpr>], |
377 | 8 | partition_buffers: &mut PartitionBatches, |
378 | 8 | ) -> Result<()> { |
379 | 8 | if record_batch.num_rows() == 0 { |
380 | 0 | return Ok(()); |
381 | 8 | } |
382 | 8 | let partition_batches = |
383 | 8 | self.evaluate_partition_batches(&record_batch, window_expr)?0 ; |
384 | 16 | for (partition_row, partition_batch8 ) in partition_batches { |
385 | 8 | let partition_batch_state = partition_buffers |
386 | 8 | .entry(partition_row) |
387 | 8 | // Use input_schema for the buffer schema, not `record_batch.schema()` |
388 | 8 | // as it may not have the "correct" schema in terms of output |
389 | 8 | // nullability constraints. For details, see the following issue: |
390 | 8 | // https://github.com/apache/datafusion/issues/9320 |
391 | 8 | .or_insert_with(|| { |
392 | 4 | PartitionBatchState::new(Arc::clone(self.input_schema())) |
393 | 8 | }); |
394 | 8 | partition_batch_state.extend(&partition_batch)?0 ; |
395 | | } |
396 | | |
397 | 8 | if self.is_mode_linear() { |
398 | | // In `Linear` mode, it is guaranteed that the first ORDER BY column |
399 | | // is sorted across partitions. Note that only the first ORDER BY |
400 | | // column is guaranteed to be ordered. As a counter example, consider |
401 | | // the case, `PARTITION BY b, ORDER BY a, c` when the input is sorted |
402 | | // by `[a, b, c]`. In this case, `BoundedWindowAggExec` mode will be |
403 | | // `Linear`. However, we cannot guarantee that the last row of the |
404 | | // input data will be the "last" data in terms of the ordering requirement |
405 | | // `[a, c]` -- it will be the "last" data in terms of `[a, b, c]`. |
406 | | // Hence, only column `a` should be used as a guarantee of the "last" |
407 | | // data across partitions. For other modes (`Sorted`, `PartiallySorted`), |
408 | | // we do not need to keep track of the most recent row guarantee across |
409 | | // partitions. Since the leading ordering separates partitions, the guarantee |
410 | | // provided by the most recent row already prunes the previous partitions completely. |
411 | 5 | let last_row = get_last_row_batch(&record_batch)?0 ; |
412 | 9 | for (_, partition_batch) in partition_buffers.iter_mut()5 { |
413 | 9 | partition_batch.set_most_recent_row(last_row.clone()); |
414 | 9 | } |
415 | 3 | } |
416 | 8 | self.mark_partition_end(partition_buffers); |
417 | | |
418 | 8 | *input_buffer = if input_buffer.num_rows() == 0 { |
419 | 4 | record_batch |
420 | | } else { |
421 | 4 | concat_batches(self.input_schema(), [input_buffer, &record_batch])?0 |
422 | | }; |
423 | | |
424 | 8 | Ok(()) |
425 | 8 | } |
426 | | |
427 | | fn input_schema(&self) -> &SchemaRef; |
428 | | } |
429 | | |
430 | | /// This object encapsulates the algorithm state for a simple linear scan |
431 | | /// algorithm for computing partitions. |
432 | | pub struct LinearSearch { |
433 | | /// Keeps the hash of input buffer calculated from PARTITION BY columns. |
434 | | /// Its length is equal to the `input_buffer` length. |
435 | | input_buffer_hashes: VecDeque<u64>, |
436 | | /// Used during hash value calculation. |
437 | | random_state: RandomState, |
438 | | /// Input ordering and partition by key ordering need not be the same, so |
439 | | /// this vector stores the mapping between them. For instance, if the input |
440 | | /// is ordered by a, b and the window expression contains a PARTITION BY b, a |
441 | | /// clause, this attribute stores [1, 0]. |
442 | | ordered_partition_by_indices: Vec<usize>, |
443 | | /// We use this [`RawTable`] to calculate unique partitions for each new |
444 | | /// RecordBatch. First entry in the tuple is the hash value, the second |
445 | | /// entry is the unique ID for each partition (increments from 0 to n). |
446 | | row_map_batch: RawTable<(u64, usize)>, |
447 | | /// We use this [`RawTable`] to calculate the output columns that we can |
448 | | /// produce at each cycle. First entry in the tuple is the hash value, the |
449 | | /// second entry is the unique ID for each partition (increments from 0 to n). |
450 | | /// The third entry stores how many new outputs are calculated for the |
451 | | /// corresponding partition. |
452 | | row_map_out: RawTable<(u64, usize, usize)>, |
453 | | input_schema: SchemaRef, |
454 | | } |
455 | | |
456 | | impl PartitionSearcher for LinearSearch { |
457 | | /// This method constructs output columns using the result of each window expression. |
458 | | // Assume input buffer is | Partition Buffers would be (where each partition and its data are separated) |
459 | | // a, 2 | a, 2 |
460 | | // b, 2 | a, 2 |
461 | | // a, 2 | a, 2 |
462 | | // b, 2 | |
463 | | // a, 2 | b, 2 |
464 | | // b, 2 | b, 2 |
465 | | // b, 2 | b, 2 |
466 | | // | b, 2 |
467 | | // Also assume we happen to calculate 2 new values for a, and 3 for b (to calculate the missing values, we may need to consider future values). |
468 | | // Partition buffers effectively will be |
469 | | // a, 2, 1 |
470 | | // a, 2, 2 |
471 | | // a, 2, (missing) |
472 | | // |
473 | | // b, 2, 1 |
474 | | // b, 2, 2 |
475 | | // b, 2, 3 |
476 | | // b, 2, (missing) |
477 | | // When partition buffers are mapped back to the original record batch, the result becomes |
478 | | // a, 2, 1 |
479 | | // b, 2, 1 |
480 | | // a, 2, 2 |
481 | | // b, 2, 2 |
482 | | // a, 2, (missing) |
483 | | // b, 2, 3 |
484 | | // b, 2, (missing) |
485 | | // This function calculates the column result of window expression(s) (the first 4 entries of the 3rd column in the section above). |
486 | | // 1 |
487 | | // 1 |
488 | | // 2 |
489 | | // 2 |
490 | | // The section above corresponds to the calculated result that can be emitted without breaking input buffer ordering. |
491 | 5 | fn calculate_out_columns( |
492 | 5 | &mut self, |
493 | 5 | input_buffer: &RecordBatch, |
494 | 5 | window_agg_states: &[PartitionWindowAggStates], |
495 | 5 | partition_buffers: &mut PartitionBatches, |
496 | 5 | window_expr: &[Arc<dyn WindowExpr>], |
497 | 5 | ) -> Result<Option<Vec<ArrayRef>>> { |
498 | 5 | let partition_output_indices = self.calc_partition_output_indices( |
499 | 5 | input_buffer, |
500 | 5 | window_agg_states, |
501 | 5 | window_expr, |
502 | 5 | )?0 ; |
503 | | |
504 | 5 | let n_window_col = window_agg_states.len(); |
505 | 5 | let mut new_columns = vec![vec![]; n_window_col]; |
506 | 5 | // Size of all_indices can be at most input_buffer.num_rows(): |
507 | 5 | let mut all_indices = UInt32Builder::with_capacity(input_buffer.num_rows()); |
508 | 9 | for (row, indices4 ) in partition_output_indices { |
509 | 4 | let length = indices.len(); |
510 | 4 | for (idx, window_agg_state) in window_agg_states.iter().enumerate() { |
511 | 4 | let partition = &window_agg_state[&row]; |
512 | 4 | let values = Arc::clone(&partition.state.out_col.slice(0, length)); |
513 | 4 | new_columns[idx].push(values); |
514 | 4 | } |
515 | 4 | let partition_batch_state = &mut partition_buffers[&row]; |
516 | 4 | // Store how many rows are generated for each partition |
517 | 4 | partition_batch_state.n_out_row = length; |
518 | 4 | // For each row keep corresponding index in the input record batch |
519 | 4 | all_indices.append_slice(&indices); |
520 | | } |
521 | 5 | let all_indices = all_indices.finish(); |
522 | 5 | if all_indices.is_empty() { |
523 | | // We couldn't generate any new value, return early: |
524 | 1 | return Ok(None); |
525 | 4 | } |
526 | | |
527 | | // Concatenate results for each column by converting `Vec<Vec<ArrayRef>>` |
528 | | // to Vec<ArrayRef> where inner `Vec<ArrayRef>`s are converted to `ArrayRef`s. |
529 | 4 | let new_columns = new_columns |
530 | 4 | .iter() |
531 | 4 | .map(|items| { |
532 | 4 | concat(&items.iter().map(|e| e.as_ref()).collect::<Vec<_>>()) |
533 | 4 | .map_err(|e| arrow_datafusion_err!(e)0 ) |
534 | 4 | }) |
535 | 4 | .collect::<Result<Vec<_>>>()?0 ; |
536 | | // We should emit columns according to row index ordering. |
537 | 4 | let sorted_indices = sort_to_indices(&all_indices, None, None)?0 ; |
538 | | // Construct new column according to row ordering. This fixes ordering |
539 | 4 | take_arrays(&new_columns, &sorted_indices).map(Some) |
540 | 5 | } |
541 | | |
542 | 5 | fn evaluate_partition_batches( |
543 | 5 | &mut self, |
544 | 5 | record_batch: &RecordBatch, |
545 | 5 | window_expr: &[Arc<dyn WindowExpr>], |
546 | 5 | ) -> Result<Vec<(PartitionKey, RecordBatch)>> { |
547 | 5 | let partition_bys = |
548 | 5 | evaluate_partition_by_column_values(record_batch, window_expr)?0 ; |
549 | | // NOTE: In Linear or PartiallySorted modes, we are sure that |
550 | | // `partition_bys` are not empty. |
551 | | // Calculate indices for each partition and construct a new record |
552 | | // batch from the rows at these indices for each partition: |
553 | 5 | self.get_per_partition_indices(&partition_bys, record_batch)?0 |
554 | 5 | .into_iter() |
555 | 5 | .map(|(row, indices)| { |
556 | 5 | let mut new_indices = UInt32Builder::with_capacity(indices.len()); |
557 | 5 | new_indices.append_slice(&indices); |
558 | 5 | let indices = new_indices.finish(); |
559 | 5 | Ok((row, get_record_batch_at_indices(record_batch, &indices)?0 )) |
560 | 5 | }) |
561 | 5 | .collect() |
562 | 5 | } |
563 | | |
564 | 4 | fn prune(&mut self, n_out: usize) { |
565 | 4 | // Delete hashes for the rows that are outputted. |
566 | 4 | self.input_buffer_hashes.drain(0..n_out); |
567 | 4 | } |
568 | | |
569 | 5 | fn mark_partition_end(&self, partition_buffers: &mut PartitionBatches) { |
570 | 5 | // We should be in the `PartiallySorted` case, otherwise we can not |
571 | 5 | // tell when we are at the end of a given partition. |
572 | 5 | if !self.ordered_partition_by_indices.is_empty() { |
573 | 0 | if let Some((last_row, _)) = partition_buffers.last() { |
574 | 0 | let last_sorted_cols = self |
575 | 0 | .ordered_partition_by_indices |
576 | 0 | .iter() |
577 | 0 | .map(|idx| last_row[*idx].clone()) |
578 | 0 | .collect::<Vec<_>>(); |
579 | 0 | for (row, partition_batch_state) in partition_buffers.iter_mut() { |
580 | 0 | let sorted_cols = self |
581 | 0 | .ordered_partition_by_indices |
582 | 0 | .iter() |
583 | 0 | .map(|idx| &row[*idx]); |
584 | 0 | // All the partitions other than `last_sorted_cols` are done. |
585 | 0 | // We are sure that we will no longer receive values for these |
586 | 0 | // partitions (arrival of a new value would violate ordering). |
587 | 0 | partition_batch_state.is_end = !sorted_cols.eq(&last_sorted_cols); |
588 | 0 | } |
589 | 0 | } |
590 | 5 | } |
591 | 5 | } |
592 | | |
593 | 5 | fn is_mode_linear(&self) -> bool { |
594 | 5 | self.ordered_partition_by_indices.is_empty() |
595 | 5 | } |
596 | | |
597 | 7 | fn input_schema(&self) -> &SchemaRef { |
598 | 7 | &self.input_schema |
599 | 7 | } |
600 | | } |
601 | | |
602 | | impl LinearSearch { |
603 | | /// Initialize a new [`LinearSearch`] partition searcher. |
604 | 1 | fn new(ordered_partition_by_indices: Vec<usize>, input_schema: SchemaRef) -> Self { |
605 | 1 | LinearSearch { |
606 | 1 | input_buffer_hashes: VecDeque::new(), |
607 | 1 | random_state: Default::default(), |
608 | 1 | ordered_partition_by_indices, |
609 | 1 | row_map_batch: RawTable::with_capacity(256), |
610 | 1 | row_map_out: RawTable::with_capacity(256), |
611 | 1 | input_schema, |
612 | 1 | } |
613 | 1 | } |
614 | | |
615 | | /// Calculate indices of each partition (according to PARTITION BY expression) |
616 | | /// `columns` contain partition by expression results. |
617 | 5 | fn get_per_partition_indices( |
618 | 5 | &mut self, |
619 | 5 | columns: &[ArrayRef], |
620 | 5 | batch: &RecordBatch, |
621 | 5 | ) -> Result<Vec<(PartitionKey, Vec<u32>)>> { |
622 | 5 | let mut batch_hashes = vec![0; batch.num_rows()]; |
623 | 5 | create_hashes(columns, &self.random_state, &mut batch_hashes)?0 ; |
624 | 5 | self.input_buffer_hashes.extend(&batch_hashes); |
625 | 5 | // reset row_map for new calculation |
626 | 5 | self.row_map_batch.clear(); |
627 | 5 | // `result` stores the PartitionKey and row indices (indices where the partition occurs in the `batch`) for each partition. |
628 | 5 | let mut result: Vec<(PartitionKey, Vec<u32>)> = vec![]; |
629 | 10 | for (hash, row_idx) in batch_hashes.into_iter().zip(0u32..)5 { |
630 | 10 | let entry = self.row_map_batch.get_mut(hash, |(_, group_idx)| { |
631 | 5 | // We can safely get the first index of the partition indices |
632 | 5 | // since partition indices has one element during initialization. |
633 | 5 | let row = get_row_at_idx(columns, row_idx as usize).unwrap(); |
634 | 5 | // Handle hash collisions with an equality check: |
635 | 5 | row.eq(&result[*group_idx].0) |
636 | 10 | }); |
637 | 10 | if let Some((_, group_idx5 )) = entry { |
638 | 5 | result[*group_idx].1.push(row_idx) |
639 | | } else { |
640 | 5 | self.row_map_batch |
641 | 5 | .insert(hash, (hash, result.len()), |(hash, _)| *hash0 ); |
642 | 5 | let row = get_row_at_idx(columns, row_idx as usize)?0 ; |
643 | | // This is a new partition; its only index is `row_idx` for now. |
644 | 5 | result.push((row, vec![row_idx])); |
645 | | } |
646 | | } |
647 | 5 | Ok(result) |
648 | 5 | } |
649 | | |
650 | | /// Calculates partition keys and result indices for each partition. |
651 | | /// The return value is a vector of tuples where the first entry stores |
652 | | /// the partition key (unique for each partition) and the second entry |
653 | | /// stores indices of the rows for which the partition is constructed. |
654 | 5 | fn calc_partition_output_indices( |
655 | 5 | &mut self, |
656 | 5 | input_buffer: &RecordBatch, |
657 | 5 | window_agg_states: &[PartitionWindowAggStates], |
658 | 5 | window_expr: &[Arc<dyn WindowExpr>], |
659 | 5 | ) -> Result<Vec<(PartitionKey, Vec<u32>)>> { |
660 | 5 | let partition_by_columns = |
661 | 5 | evaluate_partition_by_column_values(input_buffer, window_expr)?0 ; |
662 | | // Reset the row_map state: |
663 | 5 | self.row_map_out.clear(); |
664 | 5 | let mut partition_indices: Vec<(PartitionKey, Vec<u32>)> = vec![]; |
665 | 13 | for (hash, row_idx) in self.input_buffer_hashes.iter().zip(0u32..)5 { |
666 | 13 | let entry = self.row_map_out.get_mut(*hash, |(_, group_idx, _)| { |
667 | 6 | let row = |
668 | 6 | get_row_at_idx(&partition_by_columns, row_idx as usize).unwrap(); |
669 | 6 | row == partition_indices[*group_idx].0 |
670 | 13 | }); |
671 | 13 | if let Some((_, group_idx, n_out6 )) = entry { |
672 | 6 | let (_, indices) = &mut partition_indices[*group_idx]; |
673 | 6 | if indices.len() >= *n_out { |
674 | 2 | break; |
675 | 4 | } |
676 | 4 | indices.push(row_idx); |
677 | | } else { |
678 | 7 | let row = get_row_at_idx(&partition_by_columns, row_idx as usize)?0 ; |
679 | 7 | let min_out = window_agg_states |
680 | 7 | .iter() |
681 | 7 | .map(|window_agg_state| { |
682 | 7 | window_agg_state |
683 | 7 | .get(&row) |
684 | 7 | .map(|partition| partition.state.out_col.len()) |
685 | 7 | .unwrap_or(0) |
686 | 7 | }) |
687 | 7 | .min() |
688 | 7 | .unwrap_or(0); |
689 | 7 | if min_out == 0 { |
690 | 3 | break; |
691 | 4 | } |
692 | 4 | self.row_map_out.insert( |
693 | 4 | *hash, |
694 | 4 | (*hash, partition_indices.len(), min_out), |
695 | 4 | |(hash, _, _)| *hash0 , |
696 | 4 | ); |
697 | 4 | partition_indices.push((row, vec![row_idx])); |
698 | 4 | } |
699 | | } |
700 | 5 | Ok(partition_indices) |
701 | 5 | } |
702 | | } |
703 | | |
704 | | /// This object encapsulates the algorithm state for sorted searching |
705 | | /// when computing partitions. |
706 | | pub struct SortedSearch { |
707 | | /// Stores partition by columns and their ordering information |
708 | | partition_by_sort_keys: Vec<PhysicalSortExpr>, |
709 | | /// Input ordering and partition by key ordering need not be the same, so |
710 | | /// this vector stores the mapping between them. For instance, if the input |
711 | | /// is ordered by a, b and the window expression contains a PARTITION BY b, a |
712 | | /// clause, this attribute stores [1, 0]. |
713 | | ordered_partition_by_indices: Vec<usize>, |
714 | | input_schema: SchemaRef, |
715 | | } |
716 | | |
717 | | impl PartitionSearcher for SortedSearch { |
718 | | /// This method constructs new output columns using the result of each window expression. |
719 | 4 | fn calculate_out_columns( |
720 | 4 | &mut self, |
721 | 4 | _input_buffer: &RecordBatch, |
722 | 4 | window_agg_states: &[PartitionWindowAggStates], |
723 | 4 | partition_buffers: &mut PartitionBatches, |
724 | 4 | _window_expr: &[Arc<dyn WindowExpr>], |
725 | 4 | ) -> Result<Option<Vec<ArrayRef>>> { |
726 | 4 | let n_out = self.calculate_n_out_row(window_agg_states, partition_buffers); |
727 | 4 | if n_out == 0 { |
728 | 1 | Ok(None) |
729 | | } else { |
730 | 3 | window_agg_states |
731 | 3 | .iter() |
732 | 9 | .map(|map| get_aggregate_result_out_column(map, n_out).map(Some)) |
733 | 3 | .collect() |
734 | | } |
735 | 4 | } |
736 | | |
737 | 3 | fn evaluate_partition_batches( |
738 | 3 | &mut self, |
739 | 3 | record_batch: &RecordBatch, |
740 | 3 | _window_expr: &[Arc<dyn WindowExpr>], |
741 | 3 | ) -> Result<Vec<(PartitionKey, RecordBatch)>> { |
742 | 3 | let num_rows = record_batch.num_rows(); |
743 | | // Calculate result of partition by column expressions |
744 | 3 | let partition_columns = self |
745 | 3 | .partition_by_sort_keys |
746 | 3 | .iter() |
747 | 3 | .map(|elem| elem.evaluate_to_sort_column(record_batch)0 ) |
748 | 3 | .collect::<Result<Vec<_>>>()?0 ; |
749 | | // Reorder `partition_columns` such that its ordering matches input ordering. |
750 | 3 | let partition_columns_ordered = |
751 | 3 | get_at_indices(&partition_columns, &self.ordered_partition_by_indices)?0 ; |
752 | 3 | let partition_points = |
753 | 3 | evaluate_partition_ranges(num_rows, &partition_columns_ordered)?0 ; |
754 | 3 | let partition_bys = partition_columns |
755 | 3 | .into_iter() |
756 | 3 | .map(|arr| arr.values0 ) |
757 | 3 | .collect::<Vec<ArrayRef>>(); |
758 | 3 | |
759 | 3 | partition_points |
760 | 3 | .iter() |
761 | 3 | .map(|range| { |
762 | 3 | let row = get_row_at_idx(&partition_bys, range.start)?0 ; |
763 | 3 | let len = range.end - range.start; |
764 | 3 | let slice = record_batch.slice(range.start, len); |
765 | 3 | Ok((row, slice)) |
766 | 3 | }) |
767 | 3 | .collect::<Result<Vec<_>>>() |
768 | 3 | } |
769 | | |
770 | 3 | fn mark_partition_end(&self, partition_buffers: &mut PartitionBatches) { |
771 | 3 | // In Sorted case. We can mark all partitions besides last partition as ended. |
772 | 3 | // We are sure that those partitions will never receive any values. |
773 | 3 | // (Otherwise ordering invariant is violated.) |
774 | 3 | let n_partitions = partition_buffers.len(); |
775 | 3 | for (idx, (_, partition_batch_state)) in partition_buffers.iter_mut().enumerate() |
776 | 3 | { |
777 | 3 | partition_batch_state.is_end |= idx < n_partitions - 1; |
778 | 3 | } |
779 | 3 | } |
780 | | |
781 | 1 | fn input_schema(&self) -> &SchemaRef { |
782 | 1 | &self.input_schema |
783 | 1 | } |
784 | | } |
785 | | |
786 | | impl SortedSearch { |
787 | | /// Calculates how many rows we can output. |
788 | 4 | fn calculate_n_out_row( |
789 | 4 | &mut self, |
790 | 4 | window_agg_states: &[PartitionWindowAggStates], |
791 | 4 | partition_buffers: &mut PartitionBatches, |
792 | 4 | ) -> usize { |
793 | 4 | // Different window aggregators may produce results at different rates. |
794 | 4 | // We produce the overall batch result only as fast as the slowest one. |
795 | 4 | let mut counts = vec![]; |
796 | 12 | let out_col_counts = window_agg_states.iter().map(|window_agg_state| { |
797 | 12 | // Store how many elements are generated for the current |
798 | 12 | // window expression: |
799 | 12 | let mut cur_window_expr_out_result_len = 0; |
800 | 12 | // We iterate over `window_agg_state`, which is an IndexMap. |
801 | 12 | // Iterations follow the insertion order, hence we preserve |
802 | 12 | // sorting when partition columns are sorted. |
803 | 12 | let mut per_partition_out_results = HashMap::new(); |
804 | 12 | for (row, WindowState { state, .. }) in window_agg_state.iter() { |
805 | 12 | cur_window_expr_out_result_len += state.out_col.len(); |
806 | 12 | let count = per_partition_out_results.entry(row).or_insert(0); |
807 | 12 | if *count < state.out_col.len() { |
808 | 9 | *count = state.out_col.len(); |
809 | 9 | }3 |
810 | | // If we do not generate all results for the current |
811 | | // partition, we do not generate results for next |
812 | | // partition -- otherwise we will lose input ordering. |
813 | 12 | if state.n_row_result_missing > 0 { |
814 | 0 | break; |
815 | 12 | } |
816 | | } |
817 | 12 | counts.push(per_partition_out_results); |
818 | 12 | cur_window_expr_out_result_len |
819 | 12 | }); |
820 | 4 | argmin(out_col_counts).map_or(0, |(min_idx, minima)| { |
821 | 4 | for (row, count) in counts.swap_remove(min_idx).into_iter() { |
822 | 4 | let partition_batch = &mut partition_buffers[row]; |
823 | 4 | partition_batch.n_out_row = count; |
824 | 4 | } |
825 | 4 | minima |
826 | 4 | }) |
827 | 4 | } |
828 | | } |
829 | | |
830 | | /// Calculates partition by expression results for each window expression |
831 | | /// on `record_batch`. |
832 | 10 | fn evaluate_partition_by_column_values( |
833 | 10 | record_batch: &RecordBatch, |
834 | 10 | window_expr: &[Arc<dyn WindowExpr>], |
835 | 10 | ) -> Result<Vec<ArrayRef>> { |
836 | 10 | window_expr[0] |
837 | 10 | .partition_by() |
838 | 10 | .iter() |
839 | 10 | .map(|item| match item.evaluate(record_batch)?0 { |
840 | 10 | ColumnarValue::Array(array) => Ok(array), |
841 | 0 | ColumnarValue::Scalar(scalar) => { |
842 | 0 | scalar.to_array_of_size(record_batch.num_rows()) |
843 | | } |
844 | 10 | }) |
845 | 10 | .collect() |
846 | 10 | } |
847 | | |
848 | | /// Stream for the bounded window aggregation plan. |
849 | | pub struct BoundedWindowAggStream { |
850 | | schema: SchemaRef, |
851 | | input: SendableRecordBatchStream, |
852 | | /// The record batch executor receives as input (i.e. the columns needed |
853 | | /// while calculating aggregation results). |
854 | | input_buffer: RecordBatch, |
855 | | /// We separate `input_buffer` based on partitions (as |
856 | | /// determined by PARTITION BY columns) and store them per partition |
857 | | /// in `partition_batches`. We use this variable when calculating results |
858 | | /// for each window expression. This enables us to use the same batch for |
859 | | /// different window expressions without copying. |
860 | | // Note that we could keep record batches for each window expression in |
861 | | // `PartitionWindowAggStates`. However, this would use more memory (as |
862 | | // many times as the number of window expressions). |
863 | | partition_buffers: PartitionBatches, |
864 | | /// An executor can run multiple window expressions if the PARTITION BY |
865 | | /// and ORDER BY sections are same. We keep state of the each window |
866 | | /// expression inside `window_agg_states`. |
867 | | window_agg_states: Vec<PartitionWindowAggStates>, |
868 | | finished: bool, |
869 | | window_expr: Vec<Arc<dyn WindowExpr>>, |
870 | | baseline_metrics: BaselineMetrics, |
871 | | /// Search mode for partition columns. This determines the algorithm with |
872 | | /// which we group each partition. |
873 | | search_mode: Box<dyn PartitionSearcher>, |
874 | | } |
875 | | |
876 | | impl BoundedWindowAggStream { |
877 | | /// Prunes sections of the state that are no longer needed when calculating |
878 | | /// results (as determined by window frame boundaries and number of results generated). |
879 | | // For instance, if first `n` (not necessarily same with `n_out`) elements are no longer needed to |
880 | | // calculate window expression result (outside the window frame boundary) we retract first `n` elements |
881 | | // from `self.partition_batches` in corresponding partition. |
882 | | // For instance, if `n_out` number of rows are calculated, we can remove |
883 | | // first `n_out` rows from `self.input_buffer`. |
884 | 7 | fn prune_state(&mut self, n_out: usize) -> Result<()> { |
885 | 7 | // Prune `self.window_agg_states`: |
886 | 7 | self.prune_out_columns(); |
887 | 7 | // Prune `self.partition_batches`: |
888 | 7 | self.prune_partition_batches(); |
889 | 7 | // Prune `self.input_buffer`: |
890 | 7 | self.prune_input_batch(n_out)?0 ; |
891 | | // Prune internal state of search algorithm. |
892 | 7 | self.search_mode.prune(n_out); |
893 | 7 | Ok(()) |
894 | 7 | } |
895 | | } |
896 | | |
897 | | impl Stream for BoundedWindowAggStream { |
898 | | type Item = Result<RecordBatch>; |
899 | | |
900 | 564 | fn poll_next( |
901 | 564 | mut self: Pin<&mut Self>, |
902 | 564 | cx: &mut Context<'_>, |
903 | 564 | ) -> Poll<Option<Self::Item>> { |
904 | 564 | let poll = self.poll_next_inner(cx); |
905 | 564 | self.baseline_metrics.record_poll(poll) |
906 | 564 | } |
907 | | } |
908 | | |
909 | | impl BoundedWindowAggStream { |
910 | | /// Create a new BoundedWindowAggStream |
911 | 2 | fn new( |
912 | 2 | schema: SchemaRef, |
913 | 2 | window_expr: Vec<Arc<dyn WindowExpr>>, |
914 | 2 | input: SendableRecordBatchStream, |
915 | 2 | baseline_metrics: BaselineMetrics, |
916 | 2 | search_mode: Box<dyn PartitionSearcher>, |
917 | 2 | ) -> Result<Self> { |
918 | 4 | let state = window_expr.iter().map(|_| IndexMap::new()).collect(); |
919 | 2 | let empty_batch = RecordBatch::new_empty(Arc::clone(&schema)); |
920 | 2 | Ok(Self { |
921 | 2 | schema, |
922 | 2 | input, |
923 | 2 | input_buffer: empty_batch, |
924 | 2 | partition_buffers: IndexMap::new(), |
925 | 2 | window_agg_states: state, |
926 | 2 | finished: false, |
927 | 2 | window_expr, |
928 | 2 | baseline_metrics, |
929 | 2 | search_mode, |
930 | 2 | }) |
931 | 2 | } |
932 | | |
933 | 9 | fn compute_aggregates(&mut self) -> Result<RecordBatch> { |
934 | | // calculate window cols |
935 | 17 | for (cur_window_expr, state) in |
936 | 9 | self.window_expr.iter().zip(&mut self.window_agg_states) |
937 | | { |
938 | 17 | cur_window_expr.evaluate_stateful(&self.partition_buffers, state)?0 ; |
939 | | } |
940 | | |
941 | 9 | let schema = Arc::clone(&self.schema); |
942 | 9 | let window_expr_out = self.search_mode.calculate_out_columns( |
943 | 9 | &self.input_buffer, |
944 | 9 | &self.window_agg_states, |
945 | 9 | &mut self.partition_buffers, |
946 | 9 | &self.window_expr, |
947 | 9 | )?0 ; |
948 | 9 | if let Some(window_expr_out7 ) = window_expr_out { |
949 | 7 | let n_out = window_expr_out[0].len(); |
950 | 7 | // right append new columns to corresponding section in the original input buffer. |
951 | 7 | let columns_to_show = self |
952 | 7 | .input_buffer |
953 | 7 | .columns() |
954 | 7 | .iter() |
955 | 11 | .map(|elem| elem.slice(0, n_out)) |
956 | 7 | .chain(window_expr_out) |
957 | 7 | .collect::<Vec<_>>(); |
958 | 7 | let n_generated = columns_to_show[0].len(); |
959 | 7 | self.prune_state(n_generated)?0 ; |
960 | 7 | Ok(RecordBatch::try_new(schema, columns_to_show)?0 ) |
961 | | } else { |
962 | 2 | Ok(RecordBatch::new_empty(schema)) |
963 | | } |
964 | 9 | } |
965 | | |
966 | | #[inline] |
967 | 564 | fn poll_next_inner( |
968 | 564 | &mut self, |
969 | 564 | cx: &mut Context<'_>, |
970 | 564 | ) -> Poll<Option<Result<RecordBatch>>> { |
971 | 564 | if self.finished { |
972 | 1 | return Poll::Ready(None); |
973 | 563 | } |
974 | | |
975 | 563 | let result9 = match ready!554 (self.input.poll_next_unpin(cx)) { |
976 | 8 | Some(Ok(batch)) => { |
977 | 8 | self.search_mode.update_partition_batch( |
978 | 8 | &mut self.input_buffer, |
979 | 8 | batch, |
980 | 8 | &self.window_expr, |
981 | 8 | &mut self.partition_buffers, |
982 | 8 | )?0 ; |
983 | 8 | self.compute_aggregates() |
984 | | } |
985 | 0 | Some(Err(e)) => Err(e), |
986 | | None => { |
987 | 1 | self.finished = true; |
988 | 1 | for (_, partition_batch_state) in self.partition_buffers.iter_mut() { |
989 | 1 | partition_batch_state.is_end = true; |
990 | 1 | } |
991 | 1 | self.compute_aggregates() |
992 | | } |
993 | | }; |
994 | 9 | Poll::Ready(Some(result)) |
995 | 564 | } |
996 | | |
997 | | /// Prunes the sections of the record batch (for each partition) |
998 | | /// that we no longer need to calculate the window function result. |
999 | 7 | fn prune_partition_batches(&mut self) { |
1000 | 7 | // Remove partitions which we know already ended (is_end flag is true). |
1001 | 7 | // Since the retain method preserves insertion order, we still have |
1002 | 7 | // ordering in between partitions after removal. |
1003 | 7 | self.partition_buffers |
1004 | 11 | .retain(|_, partition_batch_state| !partition_batch_state.is_end); |
1005 | 7 | |
1006 | 7 | // The data in `self.partition_batches` is used by all window expressions. |
1007 | 7 | // Therefore, when removing from `self.partition_batches`, we need to remove |
1008 | 7 | // from the earliest range boundary among all window expressions. Variable |
1009 | 7 | // `n_prune_each_partition` fill the earliest range boundary information for |
1010 | 7 | // each partition. This way, we can delete the no-longer-needed sections from |
1011 | 7 | // `self.partition_batches`. |
1012 | 7 | // For instance, if window frame one uses [10, 20] and window frame two uses |
1013 | 7 | // [5, 15]; we only prune the first 5 elements from the corresponding record |
1014 | 7 | // batch in `self.partition_batches`. |
1015 | 7 | |
1016 | 7 | // Calculate how many elements to prune for each partition batch |
1017 | 7 | let mut n_prune_each_partition = HashMap::new(); |
1018 | 13 | for window_agg_state in self.window_agg_states.iter_mut()7 { |
1019 | 17 | window_agg_state.retain(13 |_, WindowState { state, .. }| !state.is_end)13 ; |
1020 | 30 | for (partition_row, WindowState { state: value17 , .. }) in window_agg_state { |
1021 | 17 | let n_prune = |
1022 | 17 | min(value.window_frame_range.start, value.last_calculated_index); |
1023 | 17 | if let Some(current6 ) = n_prune_each_partition.get_mut(partition_row) { |
1024 | 6 | if n_prune < *current { |
1025 | 3 | *current = n_prune; |
1026 | 3 | } |
1027 | 11 | } else { |
1028 | 11 | n_prune_each_partition.insert(partition_row.clone(), n_prune); |
1029 | 11 | } |
1030 | | } |
1031 | | } |
1032 | | |
1033 | | // Retract no longer needed parts during window calculations from partition batch: |
1034 | 11 | for (partition_row, n_prune) in n_prune_each_partition.iter()7 { |
1035 | 11 | let pb_state = &mut self.partition_buffers[partition_row]; |
1036 | 11 | |
1037 | 11 | let batch = &pb_state.record_batch; |
1038 | 11 | pb_state.record_batch = batch.slice(*n_prune, batch.num_rows() - n_prune); |
1039 | 11 | pb_state.n_out_row = 0; |
1040 | | |
1041 | | // Update state indices since we have pruned some rows from the beginning: |
1042 | 17 | for window_agg_state in self.window_agg_states.iter_mut()11 { |
1043 | 17 | window_agg_state[partition_row].state.prune_state(*n_prune); |
1044 | 17 | } |
1045 | | } |
1046 | 7 | } |
1047 | | |
1048 | | /// Prunes the section of the input batch whose aggregate results |
1049 | | /// are calculated and emitted. |
1050 | 7 | fn prune_input_batch(&mut self, n_out: usize) -> Result<()> { |
1051 | 7 | // Prune first n_out rows from the input_buffer |
1052 | 7 | let n_to_keep = self.input_buffer.num_rows() - n_out; |
1053 | 7 | let batch_to_keep = self |
1054 | 7 | .input_buffer |
1055 | 7 | .columns() |
1056 | 7 | .iter() |
1057 | 11 | .map(|elem| elem.slice(n_out, n_to_keep)) |
1058 | 7 | .collect::<Vec<_>>(); |
1059 | 7 | self.input_buffer = RecordBatch::try_new_with_options( |
1060 | 7 | self.input_buffer.schema(), |
1061 | 7 | batch_to_keep, |
1062 | 7 | &RecordBatchOptions::new().with_row_count(Some(n_to_keep)), |
1063 | 7 | )?0 ; |
1064 | 7 | Ok(()) |
1065 | 7 | } |
1066 | | |
1067 | | /// Prunes emitted parts from WindowAggState `out_col` field. |
1068 | 7 | fn prune_out_columns(&mut self) { |
1069 | | // We store generated columns for each window expression in the `out_col` |
1070 | | // field of `WindowAggState`. Given how many rows are emitted, we remove |
1071 | | // these sections from state. |
1072 | 13 | for partition_window_agg_states in self.window_agg_states.iter_mut()7 { |
1073 | | // Remove `n_out` entries from the `out_col` field of `WindowAggState`. |
1074 | | // `n_out` is stored in `self.partition_buffers` for each partition. |
1075 | | // If `is_end` is set, directly remove them; this shrinks the hash map. |
1076 | 13 | partition_window_agg_states |
1077 | 17 | .retain(|_, partition_batch_state| !partition_batch_state.state.is_end)13 ; |
1078 | | for ( |
1079 | 17 | partition_key, |
1080 | 17 | WindowState { |
1081 | 17 | state: WindowAggState { out_col, .. }, |
1082 | | .. |
1083 | | }, |
1084 | 30 | ) in partition_window_agg_states |
1085 | 17 | { |
1086 | 17 | let partition_batch = &mut self.partition_buffers[partition_key]; |
1087 | 17 | let n_to_del = partition_batch.n_out_row; |
1088 | 17 | let n_to_keep = out_col.len() - n_to_del; |
1089 | 17 | *out_col = out_col.slice(n_to_del, n_to_keep); |
1090 | 17 | } |
1091 | | } |
1092 | 7 | } |
1093 | | } |
1094 | | |
1095 | | impl RecordBatchStream for BoundedWindowAggStream { |
1096 | | /// Get the schema |
1097 | 0 | fn schema(&self) -> SchemaRef { |
1098 | 0 | Arc::clone(&self.schema) |
1099 | 0 | } |
1100 | | } |
1101 | | |
1102 | | // Gets the index of minimum entry, returns None if empty. |
1103 | 4 | fn argmin<T: PartialOrd>(data: impl Iterator<Item = T>) -> Option<(usize, T)> { |
1104 | 4 | data.enumerate() |
1105 | 8 | .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(Ordering::Equal)) |
1106 | 4 | } |
1107 | | |
1108 | | /// Calculates the section we can show results for expression |
1109 | 9 | fn get_aggregate_result_out_column( |
1110 | 9 | partition_window_agg_states: &PartitionWindowAggStates, |
1111 | 9 | len_to_show: usize, |
1112 | 9 | ) -> Result<ArrayRef> { |
1113 | 9 | let mut result = None; |
1114 | 9 | let mut running_length = 0; |
1115 | | // We assume that iteration order is according to insertion order |
1116 | | for ( |
1117 | | _, |
1118 | | WindowState { |
1119 | 9 | state: WindowAggState { out_col, .. }, |
1120 | | .. |
1121 | | }, |
1122 | 18 | ) in partition_window_agg_states |
1123 | | { |
1124 | 9 | if running_length < len_to_show { |
1125 | 9 | let n_to_use = min(len_to_show - running_length, out_col.len()); |
1126 | 9 | let slice_to_use = out_col.slice(0, n_to_use); |
1127 | 9 | result = Some(match result { |
1128 | 0 | Some(arr) => concat(&[&arr, &slice_to_use])?, |
1129 | 9 | None => slice_to_use, |
1130 | | }); |
1131 | 9 | running_length += n_to_use; |
1132 | | } else { |
1133 | 0 | break; |
1134 | | } |
1135 | | } |
1136 | 9 | if running_length != len_to_show { |
1137 | 0 | return exec_err!( |
1138 | 0 | "Generated row number should be {len_to_show}, it is {running_length}" |
1139 | 0 | ); |
1140 | 9 | } |
1141 | 9 | result |
1142 | 9 | .ok_or_else(|| DataFusionError::Execution("Should contain something".to_string())0 ) |
1143 | 9 | } |
1144 | | |
1145 | | /// Constructs a batch from the last row of batch in the argument. |
1146 | 5 | pub(crate) fn get_last_row_batch(batch: &RecordBatch) -> Result<RecordBatch> { |
1147 | 5 | if batch.num_rows() == 0 { |
1148 | 0 | return exec_err!("Latest batch should have at least 1 row"); |
1149 | 5 | } |
1150 | 5 | Ok(batch.slice(batch.num_rows() - 1, 1)) |
1151 | 5 | } |
1152 | | |
1153 | | #[cfg(test)] |
1154 | | mod tests { |
1155 | | use std::pin::Pin; |
1156 | | use std::sync::Arc; |
1157 | | use std::task::{Context, Poll}; |
1158 | | use std::time::Duration; |
1159 | | |
1160 | | use crate::common::collect; |
1161 | | use crate::memory::MemoryExec; |
1162 | | use crate::projection::ProjectionExec; |
1163 | | use crate::streaming::{PartitionStream, StreamingTableExec}; |
1164 | | use crate::windows::{create_window_expr, BoundedWindowAggExec, InputOrderMode}; |
1165 | | use crate::{execute_stream, get_plan_string, ExecutionPlan}; |
1166 | | |
1167 | | use arrow_array::builder::{Int64Builder, UInt64Builder}; |
1168 | | use arrow_array::RecordBatch; |
1169 | | use arrow_schema::{DataType, Field, Schema, SchemaRef, SortOptions}; |
1170 | | use datafusion_common::{ |
1171 | | assert_batches_eq, exec_datafusion_err, Result, ScalarValue, |
1172 | | }; |
1173 | | use datafusion_execution::config::SessionConfig; |
1174 | | use datafusion_execution::{ |
1175 | | RecordBatchStream, SendableRecordBatchStream, TaskContext, |
1176 | | }; |
1177 | | use datafusion_expr::{ |
1178 | | WindowFrame, WindowFrameBound, WindowFrameUnits, WindowFunctionDefinition, |
1179 | | }; |
1180 | | use datafusion_functions_aggregate::count::count_udaf; |
1181 | | use datafusion_physical_expr::expressions::{col, Column, NthValue}; |
1182 | | use datafusion_physical_expr::window::{ |
1183 | | BuiltInWindowExpr, BuiltInWindowFunctionExpr, |
1184 | | }; |
1185 | | use datafusion_physical_expr::{LexOrdering, PhysicalExpr, PhysicalSortExpr}; |
1186 | | |
1187 | | use futures::future::Shared; |
1188 | | use futures::{pin_mut, ready, FutureExt, Stream, StreamExt}; |
1189 | | use itertools::Itertools; |
1190 | | use tokio::time::timeout; |
1191 | | |
1192 | | #[derive(Debug, Clone)] |
1193 | | struct TestStreamPartition { |
1194 | | schema: SchemaRef, |
1195 | | batches: Vec<RecordBatch>, |
1196 | | idx: usize, |
1197 | | state: PolingState, |
1198 | | sleep_duration: Duration, |
1199 | | send_exit: bool, |
1200 | | } |
1201 | | |
1202 | | impl PartitionStream for TestStreamPartition { |
1203 | 1 | fn schema(&self) -> &SchemaRef { |
1204 | 1 | &self.schema |
1205 | 1 | } |
1206 | | |
1207 | 1 | fn execute(&self, _ctx: Arc<TaskContext>) -> SendableRecordBatchStream { |
1208 | 1 | // We create an iterator from the record batches and map them into Ok values, |
1209 | 1 | // converting the iterator into a futures::stream::Stream |
1210 | 1 | Box::pin(self.clone()) |
1211 | 1 | } |
1212 | | } |
1213 | | |
1214 | | impl Stream for TestStreamPartition { |
1215 | | type Item = Result<RecordBatch>; |
1216 | | |
1217 | 559 | fn poll_next( |
1218 | 559 | mut self: Pin<&mut Self>, |
1219 | 559 | cx: &mut Context<'_>, |
1220 | 559 | ) -> Poll<Option<Self::Item>> { |
1221 | 559 | self.poll_next_inner(cx) |
1222 | 559 | } |
1223 | | } |
1224 | | |
1225 | | #[derive(Debug, Clone)] |
1226 | | enum PolingState { |
1227 | | Sleep(Shared<futures::future::BoxFuture<'static, ()>>), |
1228 | | BatchReturn, |
1229 | | } |
1230 | | |
1231 | | impl TestStreamPartition { |
1232 | 559 | fn poll_next_inner( |
1233 | 559 | self: &mut Pin<&mut Self>, |
1234 | 559 | cx: &mut Context<'_>, |
1235 | 559 | ) -> Poll<Option<Result<RecordBatch>>> { |
1236 | | loop { |
1237 | 1.10k | match &mut self.state { |
1238 | | PolingState::BatchReturn => { |
1239 | | // Wait for self.sleep_duration before sending any new data |
1240 | 277 | let f = tokio::time::sleep(self.sleep_duration).boxed().shared(); |
1241 | 277 | self.state = PolingState::Sleep(f); |
1242 | 5 | let input_batch = if let Some(batch) = |
1243 | 277 | self.batches.clone().get(self.idx) |
1244 | | { |
1245 | 5 | batch.clone() |
1246 | 272 | } else if self.send_exit { |
1247 | | // Send None to signal end of data |
1248 | 0 | return Poll::Ready(None); |
1249 | | } else { |
1250 | | // Go to sleep mode |
1251 | 272 | let f = |
1252 | 272 | tokio::time::sleep(self.sleep_duration).boxed().shared(); |
1253 | 272 | self.state = PolingState::Sleep(f); |
1254 | 272 | continue; |
1255 | | }; |
1256 | 5 | self.idx += 1; |
1257 | 5 | return Poll::Ready(Some(Ok(input_batch))); |
1258 | | } |
1259 | 830 | PolingState::Sleep(future) => { |
1260 | 830 | pin_mut!(future); |
1261 | 830 | ready!554 (future.poll_unpin(cx)); |
1262 | 276 | self.state = PolingState::BatchReturn; |
1263 | | } |
1264 | | } |
1265 | | } |
1266 | 559 | } |
1267 | | } |
1268 | | |
1269 | | impl RecordBatchStream for TestStreamPartition { |
1270 | 0 | fn schema(&self) -> SchemaRef { |
1271 | 0 | Arc::clone(&self.schema) |
1272 | 0 | } |
1273 | | } |
1274 | | |
1275 | 1 | fn bounded_window_exec_pb_latent_range( |
1276 | 1 | input: Arc<dyn ExecutionPlan>, |
1277 | 1 | n_future_range: usize, |
1278 | 1 | hash: &str, |
1279 | 1 | order_by: &str, |
1280 | 1 | ) -> Result<Arc<dyn ExecutionPlan>> { |
1281 | 1 | let schema = input.schema(); |
1282 | 1 | let window_fn = WindowFunctionDefinition::AggregateUDF(count_udaf()); |
1283 | 1 | let col_expr = |
1284 | 1 | Arc::new(Column::new(schema.fields[0].name(), 0)) as Arc<dyn PhysicalExpr>; |
1285 | 1 | let args = vec![col_expr]; |
1286 | 1 | let partitionby_exprs = vec![col(hash, &schema)?0 ]; |
1287 | 1 | let orderby_exprs = vec![PhysicalSortExpr { |
1288 | 1 | expr: col(order_by, &schema)?0 , |
1289 | 1 | options: SortOptions::default(), |
1290 | 1 | }]; |
1291 | 1 | let window_frame = WindowFrame::new_bounds( |
1292 | 1 | WindowFrameUnits::Range, |
1293 | 1 | WindowFrameBound::CurrentRow, |
1294 | 1 | WindowFrameBound::Following(ScalarValue::UInt64(Some(n_future_range as u64))), |
1295 | 1 | ); |
1296 | 1 | let fn_name = format!( |
1297 | 1 | "{}({:?}) PARTITION BY: [{:?}], ORDER BY: [{:?}]", |
1298 | 1 | window_fn, args, partitionby_exprs, orderby_exprs |
1299 | 1 | ); |
1300 | 1 | let input_order_mode = InputOrderMode::Linear; |
1301 | 1 | Ok(Arc::new(BoundedWindowAggExec::try_new( |
1302 | 1 | vec![create_window_expr( |
1303 | 1 | &window_fn, |
1304 | 1 | fn_name, |
1305 | 1 | &args, |
1306 | 1 | &partitionby_exprs, |
1307 | 1 | &orderby_exprs, |
1308 | 1 | Arc::new(window_frame), |
1309 | 1 | &input.schema(), |
1310 | 1 | false, |
1311 | 1 | )?0 ], |
1312 | 1 | input, |
1313 | 1 | partitionby_exprs, |
1314 | 1 | input_order_mode, |
1315 | 0 | )?)) |
1316 | 1 | } |
1317 | | |
1318 | 1 | fn projection_exec(input: Arc<dyn ExecutionPlan>) -> Result<Arc<dyn ExecutionPlan>> { |
1319 | 1 | let schema = input.schema(); |
1320 | 1 | let exprs = input |
1321 | 1 | .schema() |
1322 | 1 | .fields |
1323 | 1 | .iter() |
1324 | 1 | .enumerate() |
1325 | 3 | .map(|(idx, field)| { |
1326 | 3 | let name = if field.name().len() > 20 { |
1327 | 1 | format!("col_{idx}") |
1328 | | } else { |
1329 | 2 | field.name().clone() |
1330 | | }; |
1331 | 3 | let expr = col(field.name(), &schema).unwrap(); |
1332 | 3 | (expr, name) |
1333 | 3 | }) |
1334 | 1 | .collect::<Vec<_>>(); |
1335 | 1 | Ok(Arc::new(ProjectionExec::try_new(exprs, input)?0 )) |
1336 | 1 | } |
1337 | | |
1338 | 1 | fn task_context_helper() -> TaskContext { |
1339 | 1 | let task_ctx = TaskContext::default(); |
1340 | 1 | // Create session context with config |
1341 | 1 | let session_config = SessionConfig::new() |
1342 | 1 | .with_batch_size(1) |
1343 | 1 | .with_target_partitions(2) |
1344 | 1 | .with_round_robin_repartition(false); |
1345 | 1 | task_ctx.with_session_config(session_config) |
1346 | 1 | } |
1347 | | |
1348 | 1 | fn task_context() -> Arc<TaskContext> { |
1349 | 1 | Arc::new(task_context_helper()) |
1350 | 1 | } |
1351 | | |
1352 | 1 | pub async fn collect_stream( |
1353 | 1 | mut stream: SendableRecordBatchStream, |
1354 | 1 | results: &mut Vec<RecordBatch>, |
1355 | 1 | ) -> Result<()> { |
1356 | 553 | while let Some(item5 ) = stream.next()6 .await { |
1357 | 5 | results.push(item?0 ); |
1358 | | } |
1359 | 0 | Ok(()) |
1360 | 0 | } |
1361 | | |
1362 | | /// Execute the [ExecutionPlan] and collect the results in memory |
1363 | 1 | pub async fn collect_with_timeout( |
1364 | 1 | plan: Arc<dyn ExecutionPlan>, |
1365 | 1 | context: Arc<TaskContext>, |
1366 | 1 | timeout_duration: Duration, |
1367 | 1 | ) -> Result<Vec<RecordBatch>> { |
1368 | 1 | let stream = execute_stream(plan, context)?0 ; |
1369 | 1 | let mut results = vec![]; |
1370 | 1 | |
1371 | 1 | // Execute the asynchronous operation with a timeout |
1372 | 1 | if timeout(timeout_duration, collect_stream(stream, &mut results)) |
1373 | 553 | .await |
1374 | 1 | .is_ok() |
1375 | | { |
1376 | 0 | return Err(exec_datafusion_err!("shouldn't have completed")); |
1377 | 1 | }; |
1378 | 1 | |
1379 | 1 | Ok(results) |
1380 | 1 | } |
1381 | | |
1382 | | /// Execute the [ExecutionPlan] and collect the results in memory |
1383 | | #[allow(dead_code)] |
1384 | 0 | pub async fn collect_bonafide( |
1385 | 0 | plan: Arc<dyn ExecutionPlan>, |
1386 | 0 | context: Arc<TaskContext>, |
1387 | 0 | ) -> Result<Vec<RecordBatch>> { |
1388 | 0 | let stream = execute_stream(plan, context)?; |
1389 | 0 | let mut results = vec![]; |
1390 | 0 |
|
1391 | 0 | collect_stream(stream, &mut results).await?; |
1392 | | |
1393 | 0 | Ok(results) |
1394 | 0 | } |
1395 | | |
1396 | 1 | fn test_schema() -> SchemaRef { |
1397 | 1 | Arc::new(Schema::new(vec![ |
1398 | 1 | Field::new("sn", DataType::UInt64, true), |
1399 | 1 | Field::new("hash", DataType::Int64, true), |
1400 | 1 | ])) |
1401 | 1 | } |
1402 | | |
1403 | 1 | fn schema_orders(schema: &SchemaRef) -> Result<Vec<LexOrdering>> { |
1404 | 1 | let orderings = vec![vec![PhysicalSortExpr { |
1405 | 1 | expr: col("sn", schema)?0 , |
1406 | 1 | options: SortOptions { |
1407 | 1 | descending: false, |
1408 | 1 | nulls_first: false, |
1409 | 1 | }, |
1410 | 1 | }]]; |
1411 | 1 | Ok(orderings) |
1412 | 1 | } |
1413 | | |
1414 | 1 | fn is_integer_division_safe(lhs: usize, rhs: usize) -> bool { |
1415 | 1 | let res = lhs / rhs; |
1416 | 1 | res * rhs == lhs |
1417 | 1 | } |
1418 | 1 | fn generate_batches( |
1419 | 1 | schema: &SchemaRef, |
1420 | 1 | n_row: usize, |
1421 | 1 | n_chunk: usize, |
1422 | 1 | ) -> Result<Vec<RecordBatch>> { |
1423 | 1 | let mut batches = vec![]; |
1424 | 1 | assert!(n_row > 0); |
1425 | 1 | assert!(n_chunk > 0); |
1426 | 1 | assert!(is_integer_division_safe(n_row, n_chunk)); |
1427 | 1 | let hash_replicate = 4; |
1428 | 1 | |
1429 | 1 | let chunks = (0..n_row) |
1430 | 1 | .chunks(n_chunk) |
1431 | 1 | .into_iter() |
1432 | 5 | .map(|elem| elem.into_iter().collect::<Vec<_>>()) |
1433 | 1 | .collect::<Vec<_>>(); |
1434 | | |
1435 | | // Send 2 RecordBatches at the source |
1436 | 6 | for sn_values5 in chunks { |
1437 | 5 | let mut sn1_array = UInt64Builder::with_capacity(sn_values.len()); |
1438 | 5 | let mut hash_array = Int64Builder::with_capacity(sn_values.len()); |
1439 | | |
1440 | 15 | for sn10 in sn_values { |
1441 | 10 | sn1_array.append_value(sn as u64); |
1442 | 10 | let hash_value = (2 - (sn / hash_replicate)) as i64; |
1443 | 10 | hash_array.append_value(hash_value); |
1444 | 10 | } |
1445 | | |
1446 | 5 | let batch = RecordBatch::try_new( |
1447 | 5 | Arc::clone(schema), |
1448 | 5 | vec![Arc::new(sn1_array.finish()), Arc::new(hash_array.finish())], |
1449 | 5 | )?0 ; |
1450 | 5 | batches.push(batch); |
1451 | | } |
1452 | 1 | Ok(batches) |
1453 | 1 | } |
1454 | | |
1455 | 1 | fn generate_never_ending_source( |
1456 | 1 | n_rows: usize, |
1457 | 1 | chunk_length: usize, |
1458 | 1 | n_partition: usize, |
1459 | 1 | is_infinite: bool, |
1460 | 1 | send_exit: bool, |
1461 | 1 | per_batch_wait_duration_in_millis: u64, |
1462 | 1 | ) -> Result<Arc<dyn ExecutionPlan>> { |
1463 | 1 | assert!(n_partition > 0); |
1464 | | |
1465 | | // Builds a `StreamingTableExec` with `n_partition` partitions over the generated test batches. |
1466 | | // We use the same hash value in the table. This makes sure that, after hashing, |
1467 | | // computation continues in only one of the output partitions; data flow should still continue. |
1468 | 1 | let schema = test_schema(); |
1469 | 1 | let orderings = schema_orders(&schema)?0 ; |
1470 | | |
1471 | | // Source waits `per_batch_wait_duration_in_millis` ms before sending the next batch |
1472 | 1 | let per_batch_wait_duration = |
1473 | 1 | Duration::from_millis(per_batch_wait_duration_in_millis); |
1474 | | |
1475 | 1 | let batches = generate_batches(&schema, n_rows, chunk_length)?0 ; |
1476 | | |
1477 | | // Source has `n_partition` partitions: `vec![elem; n]` clones the same `Arc` into every entry |
1478 | 1 | let partitions = vec![ |
1479 | 1 | Arc::new(TestStreamPartition { |
1480 | 1 | schema: Arc::clone(&schema), |
1481 | 1 | batches, |
1482 | 1 | idx: 0, |
1483 | 1 | state: PolingState::BatchReturn, |
1484 | 1 | sleep_duration: per_batch_wait_duration, |
1485 | 1 | send_exit, |
1486 | 1 | }) as _; |
1487 | 1 | n_partition |
1488 | 1 | ]; |
1489 | 1 | let source = Arc::new(StreamingTableExec::try_new( |
1490 | 1 | Arc::clone(&schema), |
1491 | 1 | partitions, |
1492 | 1 | None, |
1493 | 1 | orderings, |
1494 | 1 | is_infinite, |
1495 | 1 | None, |
1496 | 1 | )?0 ) as _; |
1497 | 1 | Ok(source) |
1498 | 1 | } |
1499 | | |
1500 | | // Tests NTH_VALUE(negative index) with the memoize feature. |
1501 | | // To trigger the memoize feature for NTH_VALUE we need to |
1502 | | // - feed BoundedWindowAggExec with a stream of batches. |
1503 | | // - use a window frame that contains UNBOUNDED PRECEDING. |
1504 | | // It is hard to ensure these conditions are met from a SQL query. |
1505 | | #[tokio::test] |
1506 | 1 | async fn test_window_nth_value_bounded_memoize() -> Result<()> { |
1507 | 1 | let config = SessionConfig::new().with_target_partitions(1); |
1508 | 1 | let task_ctx = Arc::new(TaskContext::default().with_session_config(config)); |
1509 | 1 | |
1510 | 1 | let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); |
1511 | 1 | // Single-column batch `[1, 2, 3]`; three copies of it are handed to `MemoryExec` below |
1512 | 1 | let batch = RecordBatch::try_new( |
1513 | 1 | Arc::clone(&schema), |
1514 | 1 | vec![Arc::new(arrow_array::Int32Array::from(vec![1, 2, 3]))], |
1515 | 1 | )?0 ; |
1516 | 1 | |
1517 | 1 | let memory_exec = MemoryExec::try_new( |
1518 | 1 | &[vec![batch.clone(), batch.clone(), batch.clone()]], |
1519 | 1 | Arc::clone(&schema), |
1520 | 1 | None, |
1521 | 1 | ) |
1522 | 1 | .map(|e| Arc::new(e) as Arc<dyn ExecutionPlan>)?0 ; |
1523 | 1 | let col_a = col("a", &schema)?0 ; |
1524 | 1 | let nth_value_func1 = NthValue::nth( |
1525 | 1 | "nth_value(-1)", |
1526 | 1 | Arc::clone(&col_a), |
1527 | 1 | DataType::Int32, |
1528 | 1 | 1, |
1529 | 1 | false, |
1530 | 1 | )?0 |
1531 | 1 | .reverse_expr() |
1532 | 1 | .unwrap(); |
1533 | 1 | let nth_value_func2 = NthValue::nth( |
1534 | 1 | "nth_value(-2)", |
1535 | 1 | Arc::clone(&col_a), |
1536 | 1 | DataType::Int32, |
1537 | 1 | 2, |
1538 | 1 | false, |
1539 | 1 | )?0 |
1540 | 1 | .reverse_expr() |
1541 | 1 | .unwrap(); |
1542 | 1 | let last_value_func = Arc::new(NthValue::last( |
1543 | 1 | "last", |
1544 | 1 | Arc::clone(&col_a), |
1545 | 1 | DataType::Int32, |
1546 | 1 | false, |
1547 | 1 | )) as _; |
1548 | 1 | let window_exprs = vec![ |
1549 | 1 | // LAST_VALUE(a) over ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW |
1550 | 1 | Arc::new(BuiltInWindowExpr::new( |
1551 | 1 | last_value_func, |
1552 | 1 | &[], |
1553 | 1 | &[], |
1554 | 1 | Arc::new(WindowFrame::new_bounds( |
1555 | 1 | WindowFrameUnits::Rows, |
1556 | 1 | WindowFrameBound::Preceding(ScalarValue::UInt64(None)), |
1557 | 1 | WindowFrameBound::CurrentRow, |
1558 | 1 | )), |
1559 | 1 | )) as _, |
1560 | 1 | // NTH_VALUE(a, -1): obtained by reversing NTH_VALUE(a, 1); same frame as above |
1561 | 1 | Arc::new(BuiltInWindowExpr::new( |
1562 | 1 | nth_value_func1, |
1563 | 1 | &[], |
1564 | 1 | &[], |
1565 | 1 | Arc::new(WindowFrame::new_bounds( |
1566 | 1 | WindowFrameUnits::Rows, |
1567 | 1 | WindowFrameBound::Preceding(ScalarValue::UInt64(None)), |
1568 | 1 | WindowFrameBound::CurrentRow, |
1569 | 1 | )), |
1570 | 1 | )) as _, |
1571 | 1 | // NTH_VALUE(a, -2): obtained by reversing NTH_VALUE(a, 2); same frame as above |
1572 | 1 | Arc::new(BuiltInWindowExpr::new( |
1573 | 1 | nth_value_func2, |
1574 | 1 | &[], |
1575 | 1 | &[], |
1576 | 1 | Arc::new(WindowFrame::new_bounds( |
1577 | 1 | WindowFrameUnits::Rows, |
1578 | 1 | WindowFrameBound::Preceding(ScalarValue::UInt64(None)), |
1579 | 1 | WindowFrameBound::CurrentRow, |
1580 | 1 | )), |
1581 | 1 | )) as _, |
1582 | 1 | ]; |
1583 | 1 | let physical_plan = BoundedWindowAggExec::try_new( |
1584 | 1 | window_exprs, |
1585 | 1 | memory_exec, |
1586 | 1 | vec![], |
1587 | 1 | InputOrderMode::Sorted, |
1588 | 1 | ) |
1589 | 1 | .map(|e| Arc::new(e) as Arc<dyn ExecutionPlan>)?0 ; |
1590 | 1 | |
1591 | 1 | let batches = collect(physical_plan.execute(0, task_ctx)?0 ).await0 ?0 ; |
1592 | 1 | |
1593 | 1 | let expected = vec![ |
1594 | 1 | "BoundedWindowAggExec: wdw=[last: Ok(Field { name: \"last\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, nth_value(-1): Ok(Field { name: \"nth_value(-1)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }, nth_value(-2): Ok(Field { name: \"nth_value(-2)\", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: CurrentRow, is_causal: true }], mode=[Sorted]", |
1595 | 1 | " MemoryExec: partitions=1, partition_sizes=[3]", |
1596 | 1 | ]; |
1597 | 1 | // Get the string representation of the plan and compare it with the expected plan |
1598 | 1 | let actual = get_plan_string(&physical_plan); |
1599 | 1 | assert_eq!( |
1600 | 1 | expected, actual, |
1601 | 1 | "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"0 |
1602 | 1 | ); |
1603 | 1 | |
1604 | 1 | let expected = [ |
1605 | 1 | "+---+------+---------------+---------------+", |
1606 | 1 | "| a | last | nth_value(-1) | nth_value(-2) |", |
1607 | 1 | "+---+------+---------------+---------------+", |
1608 | 1 | "| 1 | 1 | 1 | |", |
1609 | 1 | "| 2 | 2 | 2 | 1 |", |
1610 | 1 | "| 3 | 3 | 3 | 2 |", |
1611 | 1 | "| 1 | 1 | 1 | 3 |", |
1612 | 1 | "| 2 | 2 | 2 | 1 |", |
1613 | 1 | "| 3 | 3 | 3 | 2 |", |
1614 | 1 | "| 1 | 1 | 1 | 3 |", |
1615 | 1 | "| 2 | 2 | 2 | 1 |", |
1616 | 1 | "| 3 | 3 | 3 | 2 |", |
1617 | 1 | "+---+------+---------------+---------------+", |
1618 | 1 | ]; |
1619 | 1 | assert_batches_eq!(expected, &batches); |
1620 | 1 | Ok(()) |
1621 | 1 | } |
1622 | | |
1623 | | // This test checks whether the most-recent-row guarantee provided by the input batches of `BoundedWindowAggExec` |
1624 | | // helps `BoundedWindowAggExec` generate low-latency results in `Linear` mode. |
1625 | | // The input data generated at the source is |
1626 | | // "+----+------+", |
1627 | | // "| sn | hash |", |
1628 | | // "+----+------+", |
1629 | | // "| 0 | 2 |", |
1630 | | // "| 1 | 2 |", |
1631 | | // "| 2 | 2 |", |
1632 | | // "| 3 | 2 |", |
1633 | | // "| 4 | 1 |", |
1634 | | // "| 5 | 1 |", |
1635 | | // "| 6 | 1 |", |
1636 | | // "| 7 | 1 |", |
1637 | | // "| 8 | 0 |", |
1638 | | // "| 9 | 0 |", |
1639 | | // "+----+------+", |
1640 | | // |
1641 | | // Effectively, the following query is run on this data |
1642 | | // |
1643 | | // SELECT *, count(*) OVER(PARTITION BY hash ORDER BY sn RANGE BETWEEN CURRENT ROW AND 1 FOLLOWING) |
1644 | | // FROM test; |
1645 | | // |
1646 | | // partition `hash=2` receives the following data from the input |
1647 | | // |
1648 | | // "+----+------+", |
1649 | | // "| sn | hash |", |
1650 | | // "+----+------+", |
1651 | | // "| 0 | 2 |", |
1652 | | // "| 1 | 2 |", |
1653 | | // "| 2 | 2 |", |
1654 | | // "| 3 | 2 |", |
1655 | | // "+----+------+", |
1656 | | // Normally, `BoundedWindowAggExec` can only generate the following result from the input above |
1657 | | // |
1658 | | // "+----+------+---------+", |
1659 | | // "| sn | hash | count |", |
1660 | | // "+----+------+---------+", |
1661 | | // "| 0 | 2 | 2 |", |
1662 | | // "| 1 | 2 | 2 |", |
1663 | | // "| 2 | 2 |<not yet>|", |
1664 | | // "| 3 | 2 |<not yet>|", |
1665 | | // "+----+------+---------+", |
1666 | | // where the result of the last 2 rows is missing, because the window frame end may still change with future data: |
1667 | | // the window frame end is determined by `1 FOLLOWING` (to generate the result for row 3 [where sn=2] we |
1668 | | // need to receive sn=4 to make sure the window frame end bound won't change with future data). |
1669 | | // |
1670 | | // With the ability of different partitions to use the global ordering at the input (where the most up-to-date |
1671 | | // row is |
1672 | | // "| 9 | 0 |", |
1673 | | // ) |
1674 | | // |
1675 | | // `BoundedWindowAggExec` should be able to generate the following result in the test |
1676 | | // |
1677 | | // "+----+------+-------+", |
1678 | | // "| sn | hash | col_2 |", |
1679 | | // "+----+------+-------+", |
1680 | | // "| 0 | 2 | 2 |", |
1681 | | // "| 1 | 2 | 2 |", |
1682 | | // "| 2 | 2 | 2 |", |
1683 | | // "| 3 | 2 | 1 |", |
1684 | | // "| 4 | 1 | 2 |", |
1685 | | // "| 5 | 1 | 2 |", |
1686 | | // "| 6 | 1 | 2 |", |
1687 | | // "| 7 | 1 | 1 |", |
1688 | | // "+----+------+-------+", |
1689 | | // |
1690 | | // where the result for all rows except the last 2 is calculated (to calculate the result for row 9, where sn=8, |
1691 | | // we need to receive the sn=10 value). |
1692 | | // In this test, our aim is to test for which portion of the input data `BoundedWindowAggExec` can generate |
1693 | | // a result. To test this behaviour, we generate the data at the source infinitely (no `None` signal |
1694 | | // is sent to the output from the source). After the row |
1695 | | // |
1696 | | // "| 9 | 0 |", |
1697 | | // |
1698 | | // is sent, the source stops sending data to the output. We collect the result emitted by the `BoundedWindowAggExec` at the |
1699 | | // end of the pipeline with a timeout (since no `None` is sent from the source, collection would never end otherwise). |
1700 | | #[tokio::test] |
1701 | 1 | async fn bounded_window_exec_linear_mode_range_information() -> Result<()> { |
1702 | 1 | let n_rows = 10; |
1703 | 1 | let chunk_length = 2; |
1704 | 1 | let n_future_range = 1; |
1705 | 1 | |
1706 | 1 | let timeout_duration = Duration::from_millis(2000); |
1707 | 1 | // Arguments below: n_partition = 1, is_infinite = true, send_exit = false, 5 ms wait per batch |
1708 | 1 | let source = |
1709 | 1 | generate_never_ending_source(n_rows, chunk_length, 1, true, false, 5)?0 ; |
1710 | 1 | |
1711 | 1 | let window = |
1712 | 1 | bounded_window_exec_pb_latent_range(source, n_future_range, "hash", "sn")?0 ; |
1713 | 1 | |
1714 | 1 | let plan = projection_exec(window)?0 ; |
1715 | 1 | |
1716 | 1 | let expected_plan = vec![ |
1717 | 1 | "ProjectionExec: expr=[sn@0 as sn, hash@1 as hash, count([Column { name: \"sn\", index: 0 }]) PARTITION BY: [[Column { name: \"hash\", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: \"sn\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]@2 as col_2]", |
1718 | 1 | " BoundedWindowAggExec: wdw=[count([Column { name: \"sn\", index: 0 }]) PARTITION BY: [[Column { name: \"hash\", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: \"sn\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]: Ok(Field { name: \"count([Column { name: \\\"sn\\\", index: 0 }]) PARTITION BY: [[Column { name: \\\"hash\\\", index: 1 }]], ORDER BY: [[PhysicalSortExpr { expr: Column { name: \\\"sn\\\", index: 0 }, options: SortOptions { descending: false, nulls_first: true } }]]\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: CurrentRow, end_bound: Following(UInt64(1)), is_causal: false }], mode=[Linear]", |
1719 | 1 | " StreamingTableExec: partition_sizes=1, projection=[sn, hash], infinite_source=true, output_ordering=[sn@0 ASC NULLS LAST]", |
1720 | 1 | ]; |
1721 | 1 | |
1722 | 1 | // Get the string representation of the plan and check it against the expected plan |
1723 | 1 | let actual = get_plan_string(&plan); |
1724 | 1 | assert_eq!( |
1725 | 1 | expected_plan, actual, |
1726 | 1 | "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected_plan:#?}\nactual:\n\n{actual:#?}\n\n"0 |
1727 | 1 | ); |
1728 | 1 | |
1729 | 1 | let task_ctx = task_context(); |
1730 | 553 | let batches1 = collect_with_timeout(plan, task_ctx, timeout_duration)1 .await?0 ; |
1731 | 1 | |
1732 | 1 | let expected = [ |
1733 | 1 | "+----+------+-------+", |
1734 | 1 | "| sn | hash | col_2 |", |
1735 | 1 | "+----+------+-------+", |
1736 | 1 | "| 0 | 2 | 2 |", |
1737 | 1 | "| 1 | 2 | 2 |", |
1738 | 1 | "| 2 | 2 | 2 |", |
1739 | 1 | "| 3 | 2 | 1 |", |
1740 | 1 | "| 4 | 1 | 2 |", |
1741 | 1 | "| 5 | 1 | 2 |", |
1742 | 1 | "| 6 | 1 | 2 |", |
1743 | 1 | "| 7 | 1 | 1 |", |
1744 | 1 | "+----+------+-------+", |
1745 | 1 | ]; |
1746 | 1 | assert_batches_eq!(expected, &batches); |
1747 | 1 | |
1748 | 1 | Ok(()) |
1749 | 1 | } |
1750 | | } |