Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/sorts/partial_sort.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Partial Sort deals with input data that partially
19
//! satisfies the required sort order. Such input data can be
20
//! partitioned into segments where each segment already has the
21
//! required information for lexicographic sorting so sorting
22
//! can be done without loading the entire dataset.
23
//!
24
//! Consider a sort plan having an input with ordering `a ASC, b ASC`
25
//!
26
//! ```text
27
//! +---+---+---+
28
//! | a | b | d |
29
//! +---+---+---+
30
//! | 0 | 0 | 3 |
31
//! | 0 | 0 | 2 |
32
//! | 0 | 1 | 1 |
33
//! | 0 | 2 | 0 |
34
//! +---+---+---+
35
//!```
36
//!
37
//! and required ordering for the plan is `a ASC, b ASC, d ASC`.
38
//! The first 3 rows(segment) can be sorted as the segment already
39
//! has the required information for the sort, but the last row
40
//! requires further information as the input can continue with a
41
//! batch with a starting row where a and b do not change as below
42
//!
43
//! ```text
44
//! +---+---+---+
45
//! | a | b | d |
46
//! +---+---+---+
47
//! | 0 | 2 | 4 |
48
//! +---+---+---+
49
//!```
50
//!
51
//! The plan concats incoming data with such last rows of previous input
52
//! and continues partial sorting of the segments.
53
54
use std::any::Any;
55
use std::fmt::Debug;
56
use std::pin::Pin;
57
use std::sync::Arc;
58
use std::task::{Context, Poll};
59
60
use crate::expressions::PhysicalSortExpr;
61
use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
62
use crate::sorts::sort::sort_batch;
63
use crate::{
64
    DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties,
65
    Partitioning, PlanProperties, SendableRecordBatchStream, Statistics,
66
};
67
68
use arrow::compute::concat_batches;
69
use arrow::datatypes::SchemaRef;
70
use arrow::record_batch::RecordBatch;
71
use datafusion_common::utils::evaluate_partition_ranges;
72
use datafusion_common::Result;
73
use datafusion_execution::{RecordBatchStream, TaskContext};
74
use datafusion_physical_expr::LexOrdering;
75
76
use futures::{ready, Stream, StreamExt};
77
use log::trace;
78
79
/// Partial Sort execution plan.
80
#[derive(Debug, Clone)]
81
pub struct PartialSortExec {
82
    /// Input plan whose output is sorted by this operator
83
    pub(crate) input: Arc<dyn ExecutionPlan>,
84
    /// Sort expressions describing the required output ordering
85
    expr: Vec<PhysicalSortExpr>,
86
    /// Number of leading sort expressions in `expr` that the input
87
    /// is already ordered by; expected to be greater than zero
88
    common_prefix_length: usize,
89
    /// Metrics set that each stream created by `execute` registers its baseline metrics with
90
    metrics_set: ExecutionPlanMetricsSet,
91
    /// Preserve partitions of input plan. If false, the plan reports a
92
    /// single output partition and requires a single input partition.
93
    preserve_partitioning: bool,
94
    /// If `Some(n)`, limits the output to the first `n` sorted rows
95
    fetch: Option<usize>,
96
    /// Cache holding plan properties like equivalences, output partitioning etc.
97
    cache: PlanProperties,
98
}
99
100
impl PartialSortExec {
101
    /// Create a new partial sort execution plan
102
13
    pub fn new(
103
13
        expr: Vec<PhysicalSortExpr>,
104
13
        input: Arc<dyn ExecutionPlan>,
105
13
        common_prefix_length: usize,
106
13
    ) -> Self {
107
13
        debug_assert!(common_prefix_length > 0);
108
13
        let preserve_partitioning = false;
109
13
        let cache = Self::compute_properties(&input, expr.clone(), preserve_partitioning);
110
13
        Self {
111
13
            input,
112
13
            expr,
113
13
            common_prefix_length,
114
13
            metrics_set: ExecutionPlanMetricsSet::new(),
115
13
            preserve_partitioning,
116
13
            fetch: None,
117
13
            cache,
118
13
        }
119
13
    }
120
121
    /// Whether this `PartialSortExec` preserves partitioning of the children
122
0
    pub fn preserve_partitioning(&self) -> bool {
123
0
        self.preserve_partitioning
124
0
    }
125
126
    /// Specify the partitioning behavior of this partial sort exec
127
    ///
128
    /// If `preserve_partitioning` is true, sorts each partition
129
    /// individually, producing one sorted stream for each input partition.
130
    ///
131
    /// If `preserve_partitioning` is false, sorts and merges all
132
    /// input partitions producing a single, sorted partition.
133
0
    pub fn with_preserve_partitioning(mut self, preserve_partitioning: bool) -> Self {
134
0
        self.preserve_partitioning = preserve_partitioning;
135
0
        self.cache = self
136
0
            .cache
137
0
            .with_partitioning(Self::output_partitioning_helper(
138
0
                &self.input,
139
0
                self.preserve_partitioning,
140
0
            ));
141
0
        self
142
0
    }
143
144
    /// Modify how many rows to include in the result
145
    ///
146
    /// If None, then all rows will be returned, in sorted order.
147
    /// If Some, then only the top `fetch` rows will be returned.
148
    /// This can reduce the memory pressure required by the sort
149
    /// operation since rows that are not going to be included
150
    /// can be dropped.
151
6
    pub fn with_fetch(mut self, fetch: Option<usize>) -> Self {
152
6
        self.fetch = fetch;
153
6
        self
154
6
    }
155
156
    /// Input schema
157
0
    pub fn input(&self) -> &Arc<dyn ExecutionPlan> {
158
0
        &self.input
159
0
    }
160
161
    /// Sort expressions
162
0
    pub fn expr(&self) -> &[PhysicalSortExpr] {
163
0
        &self.expr
164
0
    }
165
166
    /// If `Some(fetch)`, limits output to only the first "fetch" items
167
0
    pub fn fetch(&self) -> Option<usize> {
168
0
        self.fetch
169
0
    }
170
171
13
    fn output_partitioning_helper(
172
13
        input: &Arc<dyn ExecutionPlan>,
173
13
        preserve_partitioning: bool,
174
13
    ) -> Partitioning {
175
13
        // Get output partitioning:
176
13
        if preserve_partitioning {
177
0
            input.output_partitioning().clone()
178
        } else {
179
13
            Partitioning::UnknownPartitioning(1)
180
        }
181
13
    }
182
183
    /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
184
13
    fn compute_properties(
185
13
        input: &Arc<dyn ExecutionPlan>,
186
13
        sort_exprs: LexOrdering,
187
13
        preserve_partitioning: bool,
188
13
    ) -> PlanProperties {
189
13
        // Calculate equivalence properties; i.e. reset the ordering equivalence
190
13
        // class with the new ordering:
191
13
        let eq_properties = input
192
13
            .equivalence_properties()
193
13
            .clone()
194
13
            .with_reorder(sort_exprs);
195
13
196
13
        // Get output partitioning:
197
13
        let output_partitioning =
198
13
            Self::output_partitioning_helper(input, preserve_partitioning);
199
13
200
13
        // Determine execution mode:
201
13
        let mode = input.execution_mode();
202
13
203
13
        PlanProperties::new(eq_properties, output_partitioning, mode)
204
13
    }
205
}
206
207
impl DisplayAs for PartialSortExec {
208
0
    fn fmt_as(
209
0
        &self,
210
0
        t: DisplayFormatType,
211
0
        f: &mut std::fmt::Formatter,
212
0
    ) -> std::fmt::Result {
213
0
        match t {
214
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
215
0
                let expr = PhysicalSortExpr::format_list(&self.expr);
216
0
                let common_prefix_length = self.common_prefix_length;
217
0
                match self.fetch {
218
0
                    Some(fetch) => {
219
0
                        write!(f, "PartialSortExec: TopK(fetch={fetch}), expr=[{expr}], common_prefix_length=[{common_prefix_length}]", )
220
                    }
221
0
                    None => write!(f, "PartialSortExec: expr=[{expr}], common_prefix_length=[{common_prefix_length}]"),
222
                }
223
            }
224
        }
225
0
    }
226
}
227
228
impl ExecutionPlan for PartialSortExec {
229
0
    fn name(&self) -> &'static str {
230
0
        "PartialSortExec"
231
0
    }
232
233
0
    fn as_any(&self) -> &dyn Any {
234
0
        self
235
0
    }
236
237
16
    fn properties(&self) -> &PlanProperties {
238
16
        &self.cache
239
16
    }
240
241
0
    fn fetch(&self) -> Option<usize> {
242
0
        self.fetch
243
0
    }
244
245
0
    fn required_input_distribution(&self) -> Vec<Distribution> {
246
0
        if self.preserve_partitioning {
247
0
            vec![Distribution::UnspecifiedDistribution]
248
        } else {
249
0
            vec![Distribution::SinglePartition]
250
        }
251
0
    }
252
253
0
    fn benefits_from_input_partitioning(&self) -> Vec<bool> {
254
0
        vec![false]
255
0
    }
256
257
0
    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
258
0
        vec![&self.input]
259
0
    }
260
261
0
    fn with_new_children(
262
0
        self: Arc<Self>,
263
0
        children: Vec<Arc<dyn ExecutionPlan>>,
264
0
    ) -> Result<Arc<dyn ExecutionPlan>> {
265
0
        let new_partial_sort = PartialSortExec::new(
266
0
            self.expr.clone(),
267
0
            Arc::clone(&children[0]),
268
0
            self.common_prefix_length,
269
0
        )
270
0
        .with_fetch(self.fetch)
271
0
        .with_preserve_partitioning(self.preserve_partitioning);
272
0
273
0
        Ok(Arc::new(new_partial_sort))
274
0
    }
275
276
13
    fn execute(
277
13
        &self,
278
13
        partition: usize,
279
13
        context: Arc<TaskContext>,
280
13
    ) -> Result<SendableRecordBatchStream> {
281
13
        trace!(
"Start PartialSortExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id()0
);
282
283
13
        let input = self.input.execute(partition, Arc::clone(&context))
?0
;
284
285
13
        trace!(
286
0
            "End PartialSortExec's input.execute for partition: {}",
287
            partition
288
        );
289
290
        // Make sure common prefix length is larger than 0
291
        // Otherwise, we should use SortExec.
292
13
        debug_assert!(self.common_prefix_length > 0);
293
294
13
        Ok(Box::pin(PartialSortStream {
295
13
            input,
296
13
            expr: self.expr.clone(),
297
13
            common_prefix_length: self.common_prefix_length,
298
13
            in_mem_batches: vec![],
299
13
            fetch: self.fetch,
300
13
            is_closed: false,
301
13
            baseline_metrics: BaselineMetrics::new(&self.metrics_set, partition),
302
13
        }))
303
13
    }
304
305
1
    fn metrics(&self) -> Option<MetricsSet> {
306
1
        Some(self.metrics_set.clone_inner())
307
1
    }
308
309
0
    fn statistics(&self) -> Result<Statistics> {
310
0
        self.input.statistics()
311
0
    }
312
}
313
314
/// Stream that buffers input batches and emits them sorted, segment by segment.
struct PartialSortStream {
315
    /// Stream of record batches produced by the input plan
316
    input: SendableRecordBatchStream,
317
    /// Sort expressions describing the required output ordering
318
    expr: Vec<PhysicalSortExpr>,
319
    /// Length of prefix common to input ordering and required ordering of plan
320
    /// should be more than 0 otherwise PartialSort is not applicable
321
    common_prefix_length: usize,
322
    /// Buffer for the part of the input that is not yet ready to be sorted
323
    in_mem_batches: Vec<RecordBatch>,
324
    /// Number of rows still to be emitted; `None` means no limit
325
    fetch: Option<usize>,
326
    /// Whether the stream has finished returning all of its data or not
327
    is_closed: bool,
328
    /// Execution metrics
329
    baseline_metrics: BaselineMetrics,
330
}
331
332
impl Stream for PartialSortStream {
333
    type Item = Result<RecordBatch>;
334
335
42
    fn poll_next(
336
42
        mut self: Pin<&mut Self>,
337
42
        cx: &mut Context<'_>,
338
42
    ) -> Poll<Option<Self::Item>> {
339
42
        let poll = self.poll_next_inner(cx);
340
42
        self.baseline_metrics.record_poll(poll)
341
42
    }
342
343
0
    fn size_hint(&self) -> (usize, Option<usize>) {
344
0
        // we can't predict the size of incoming batches so re-use the size hint from the input
345
0
        self.input.size_hint()
346
0
    }
347
}
348
349
impl RecordBatchStream for PartialSortStream {
350
29
    fn schema(&self) -> SchemaRef {
351
29
        self.input.schema()
352
29
    }
353
}
354
355
impl PartialSortStream {
356
42
    fn poll_next_inner(
357
42
        self: &mut Pin<&mut Self>,
358
42
        cx: &mut Context<'_>,
359
42
    ) -> Poll<Option<Result<RecordBatch>>> {
360
42
        if self.is_closed {
361
12
            return Poll::Ready(None);
362
30
        }
363
30
        let 
result29
= match
ready!1
(self.input.poll_next_unpin(cx)) {
364
21
            Some(Ok(batch)) => {
365
15
                if let Some(slice_point) =
366
21
                    self.get_slice_point(self.common_prefix_length, &batch)
?0
367
                {
368
15
                    self.in_mem_batches.push(batch.slice(0, slice_point));
369
15
                    let remaining_batch =
370
15
                        batch.slice(slice_point, batch.num_rows() - slice_point);
371
15
                    let sorted_batch = self.sort_in_mem_batches();
372
15
                    self.in_mem_batches.push(remaining_batch);
373
15
                    sorted_batch
374
                } else {
375
6
                    self.in_mem_batches.push(batch);
376
6
                    Ok(RecordBatch::new_empty(self.schema()))
377
                }
378
            }
379
0
            Some(Err(e)) => Err(e),
380
            None => {
381
8
                self.is_closed = true;
382
8
                // once input is consumed, sort the rest of the inserted batches
383
8
                self.sort_in_mem_batches()
384
            }
385
        };
386
387
29
        Poll::Ready(Some(result))
388
42
    }
389
390
    /// Returns a sorted RecordBatch from in_mem_batches and clears in_mem_batches
391
    ///
392
    /// If fetch is specified for PartialSortStream `sort_in_mem_batches` will limit
393
    /// the last RecordBatch returned and will mark the stream as closed
394
23
    fn sort_in_mem_batches(self: &mut Pin<&mut Self>) -> Result<RecordBatch> {
395
23
        let input_batch = concat_batches(&self.schema(), &self.in_mem_batches)
?0
;
396
23
        self.in_mem_batches.clear();
397
23
        let result = sort_batch(&input_batch, &self.expr, self.fetch)
?0
;
398
23
        if let Some(
remaining_fetch10
) = self.fetch {
399
            // remaining_fetch - result.num_rows() is always be >= 0
400
            // because result length of sort_batch with limit cannot be
401
            // more than the requested limit
402
10
            self.fetch = Some(remaining_fetch - result.num_rows());
403
10
            if remaining_fetch == result.num_rows() {
404
6
                self.is_closed = true;
405
6
            }
4
406
13
        }
407
23
        Ok(result)
408
23
    }
409
410
    /// Return the end index of the second last partition if the batch
411
    /// can be partitioned based on its already sorted columns
412
    ///
413
    /// Return None if the batch cannot be partitioned, which means the
414
    /// batch does not have the information for a safe sort
415
21
    fn get_slice_point(
416
21
        &self,
417
21
        common_prefix_len: usize,
418
21
        batch: &RecordBatch,
419
21
    ) -> Result<Option<usize>> {
420
21
        let common_prefix_sort_keys = (0..common_prefix_len)
421
25
            .map(|idx| self.expr[idx].evaluate_to_sort_column(batch))
422
21
            .collect::<Result<Vec<_>>>()
?0
;
423
21
        let partition_points =
424
21
            evaluate_partition_ranges(batch.num_rows(), &common_prefix_sort_keys)
?0
;
425
        // If partition points are [0..100], [100..200], [200..300]
426
        // we should return 200, which is the safest and furthest partition boundary
427
        // Please note that we shouldn't return 300 (which is number of rows in the batch),
428
        // because this boundary may change with new data.
429
21
        if partition_points.len() >= 2 {
430
15
            Ok(Some(partition_points[partition_points.len() - 2].end))
431
        } else {
432
6
            Ok(None)
433
        }
434
21
    }
435
}
436
437
#[cfg(test)]
438
mod tests {
439
    use std::collections::HashMap;
440
441
    use arrow::array::*;
442
    use arrow::compute::SortOptions;
443
    use arrow::datatypes::*;
444
    use futures::FutureExt;
445
    use itertools::Itertools;
446
447
    use datafusion_common::assert_batches_eq;
448
449
    use crate::collect;
450
    use crate::expressions::col;
451
    use crate::memory::MemoryExec;
452
    use crate::sorts::sort::SortExec;
453
    use crate::test;
454
    use crate::test::assert_is_pending;
455
    use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec};
456
457
    use super::*;
458
459
    #[tokio::test]
460
1
    async fn test_partial_sort() -> Result<()> {
461
1
        let task_ctx = Arc::new(TaskContext::default());
462
1
        let source = test::build_table_scan_i32(
463
1
            ("a", &vec![0, 0, 0, 1, 1, 1]),
464
1
            ("b", &vec![1, 1, 2, 2, 3, 3]),
465
1
            ("c", &vec![1, 0, 5, 4, 3, 2]),
466
1
        );
467
1
        let schema = Schema::new(vec![
468
1
            Field::new("a", DataType::Int32, false),
469
1
            Field::new("b", DataType::Int32, false),
470
1
            Field::new("c", DataType::Int32, false),
471
1
        ]);
472
1
        let option_asc = SortOptions {
473
1
            descending: false,
474
1
            nulls_first: false,
475
1
        };
476
1
477
1
        let partial_sort_exec = Arc::new(PartialSortExec::new(
478
1
            vec![
479
1
                PhysicalSortExpr {
480
1
                    expr: col("a", &schema)
?0
,
481
1
                    options: option_asc,
482
1
                },
483
1
                PhysicalSortExpr {
484
1
                    expr: col("b", &schema)
?0
,
485
1
                    options: option_asc,
486
1
                },
487
1
                PhysicalSortExpr {
488
1
                    expr: col("c", &schema)
?0
,
489
1
                    options: option_asc,
490
1
                },
491
1
            ],
492
1
            Arc::clone(&source),
493
1
            2,
494
1
        )) as Arc<dyn ExecutionPlan>;
495
1
496
1
        let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).
await0
?0
;
497
1
498
1
        let expected_after_sort = [
499
1
            "+---+---+---+",
500
1
            "| a | b | c |",
501
1
            "+---+---+---+",
502
1
            "| 0 | 1 | 0 |",
503
1
            "| 0 | 1 | 1 |",
504
1
            "| 0 | 2 | 5 |",
505
1
            "| 1 | 2 | 4 |",
506
1
            "| 1 | 3 | 2 |",
507
1
            "| 1 | 3 | 3 |",
508
1
            "+---+---+---+",
509
1
        ];
510
1
        assert_eq!(2, result.len());
511
1
        assert_batches_eq!(expected_after_sort, &result);
512
1
        assert_eq!(
513
1
            task_ctx.runtime_env().memory_pool.reserved(),
514
1
            0,
515
1
            
"The sort should have returned all memory used back to the memory manager"0
516
1
        );
517
1
518
1
        Ok(())
519
1
    }
520
521
    #[tokio::test]
522
1
    async fn test_partial_sort_with_fetch() -> Result<()> {
523
1
        let task_ctx = Arc::new(TaskContext::default());
524
1
        let source = test::build_table_scan_i32(
525
1
            ("a", &vec![0, 0, 1, 1, 1]),
526
1
            ("b", &vec![1, 2, 2, 3, 3]),
527
1
            ("c", &vec![4, 3, 2, 1, 0]),
528
1
        );
529
1
        let schema = Schema::new(vec![
530
1
            Field::new("a", DataType::Int32, false),
531
1
            Field::new("b", DataType::Int32, false),
532
1
            Field::new("c", DataType::Int32, false),
533
1
        ]);
534
1
        let option_asc = SortOptions {
535
1
            descending: false,
536
1
            nulls_first: false,
537
1
        };
538
1
539
3
        for 
common_prefix_length2
in [1, 2] {
540
2
            let partial_sort_exec = Arc::new(
541
1
                PartialSortExec::new(
542
2
                    vec![
543
2
                        PhysicalSortExpr {
544
2
                            expr: col("a", &schema)
?0
,
545
2
                            options: option_asc,
546
2
                        },
547
2
                        PhysicalSortExpr {
548
2
                            expr: col("b", &schema)
?0
,
549
2
                            options: option_asc,
550
2
                        },
551
2
                        PhysicalSortExpr {
552
2
                            expr: col("c", &schema)
?0
,
553
2
                            options: option_asc,
554
2
                        },
555
2
                    ],
556
2
                    Arc::clone(&source),
557
2
                    common_prefix_length,
558
2
                )
559
2
                .with_fetch(Some(4)),
560
1
            ) as Arc<dyn ExecutionPlan>;
561
1
562
2
            let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).
await0
?0
;
563
1
564
2
            let expected_after_sort = [
565
2
                "+---+---+---+",
566
2
                "| a | b | c |",
567
2
                "+---+---+---+",
568
2
                "| 0 | 1 | 4 |",
569
2
                "| 0 | 2 | 3 |",
570
2
                "| 1 | 2 | 2 |",
571
2
                "| 1 | 3 | 0 |",
572
2
                "+---+---+---+",
573
2
            ];
574
2
            assert_eq!(2, result.len());
575
2
            assert_batches_eq!(expected_after_sort, &result);
576
2
            assert_eq!(
577
2
                task_ctx.runtime_env().memory_pool.reserved(),
578
1
                0,
579
1
                
"The sort should have returned all memory used back to the memory manager"0
580
1
            );
581
1
        }
582
1
583
1
        Ok(())
584
1
    }
585
586
    #[tokio::test]
587
1
    async fn test_partial_sort2() -> Result<()> {
588
1
        let task_ctx = Arc::new(TaskContext::default());
589
1
        let source_tables = [
590
1
            test::build_table_scan_i32(
591
1
                ("a", &vec![0, 0, 0, 0, 1, 1, 1, 1]),
592
1
                ("b", &vec![1, 1, 3, 3, 4, 4, 2, 2]),
593
1
                ("c", &vec![7, 6, 5, 4, 3, 2, 1, 0]),
594
1
            ),
595
1
            test::build_table_scan_i32(
596
1
                ("a", &vec![0, 0, 0, 0, 1, 1, 1, 1]),
597
1
                ("b", &vec![1, 1, 3, 3, 2, 2, 4, 4]),
598
1
                ("c", &vec![7, 6, 5, 4, 1, 0, 3, 2]),
599
1
            ),
600
1
        ];
601
1
        let schema = Schema::new(vec![
602
1
            Field::new("a", DataType::Int32, false),
603
1
            Field::new("b", DataType::Int32, false),
604
1
            Field::new("c", DataType::Int32, false),
605
1
        ]);
606
1
        let option_asc = SortOptions {
607
1
            descending: false,
608
1
            nulls_first: false,
609
1
        };
610
2
        for (common_prefix_length, source) in
611
1
            [(1, &source_tables[0]), (2, &source_tables[1])]
612
1
        {
613
2
            let partial_sort_exec = Arc::new(PartialSortExec::new(
614
2
                vec![
615
2
                    PhysicalSortExpr {
616
2
                        expr: col("a", &schema)
?0
,
617
2
                        options: option_asc,
618
2
                    },
619
2
                    PhysicalSortExpr {
620
2
                        expr: col("b", &schema)
?0
,
621
2
                        options: option_asc,
622
2
                    },
623
2
                    PhysicalSortExpr {
624
2
                        expr: col("c", &schema)
?0
,
625
2
                        options: option_asc,
626
2
                    },
627
2
                ],
628
2
                Arc::clone(source),
629
2
                common_prefix_length,
630
1
            ));
631
1
632
2
            let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).
await0
?0
;
633
2
            assert_eq!(2, result.len());
634
2
            assert_eq!(
635
2
                task_ctx.runtime_env().memory_pool.reserved(),
636
1
                0,
637
1
                
"The sort should have returned all memory used back to the memory manager"0
638
1
            );
639
2
            let expected = [
640
2
                "+---+---+---+",
641
2
                "| a | b | c |",
642
2
                "+---+---+---+",
643
2
                "| 0 | 1 | 6 |",
644
2
                "| 0 | 1 | 7 |",
645
2
                "| 0 | 3 | 4 |",
646
2
                "| 0 | 3 | 5 |",
647
2
                "| 1 | 2 | 0 |",
648
2
                "| 1 | 2 | 1 |",
649
2
                "| 1 | 4 | 2 |",
650
2
                "| 1 | 4 | 3 |",
651
2
                "+---+---+---+",
652
2
            ];
653
2
            assert_batches_eq!(expected, &result);
654
1
        }
655
1
        Ok(())
656
1
    }
657
658
2
    fn prepare_partitioned_input() -> Arc<dyn ExecutionPlan> {
659
2
        let batch1 = test::build_table_i32(
660
2
            ("a", &vec![1; 100]),
661
2
            ("b", &(0..100).rev().collect()),
662
2
            ("c", &(0..100).rev().collect()),
663
2
        );
664
2
        let batch2 = test::build_table_i32(
665
2
            ("a", &[&vec![1; 25][..], &vec![2; 75][..]].concat()),
666
2
            ("b", &(100..200).rev().collect()),
667
2
            ("c", &(0..100).collect()),
668
2
        );
669
2
        let batch3 = test::build_table_i32(
670
2
            ("a", &[&vec![3; 50][..], &vec![4; 50][..]].concat()),
671
2
            ("b", &(150..250).rev().collect()),
672
2
            ("c", &(0..100).rev().collect()),
673
2
        );
674
2
        let batch4 = test::build_table_i32(
675
2
            ("a", &vec![4; 100]),
676
2
            ("b", &(50..150).rev().collect()),
677
2
            ("c", &(0..100).rev().collect()),
678
2
        );
679
2
        let schema = batch1.schema();
680
2
        Arc::new(
681
2
            MemoryExec::try_new(
682
2
                &[vec![batch1, batch2, batch3, batch4]],
683
2
                Arc::clone(&schema),
684
2
                None,
685
2
            )
686
2
            .unwrap(),
687
2
        ) as Arc<dyn ExecutionPlan>
688
2
    }
689
690
    #[tokio::test]
691
1
    async fn test_partitioned_input_partial_sort() -> Result<()> {
692
1
        let task_ctx = Arc::new(TaskContext::default());
693
1
        let mem_exec = prepare_partitioned_input();
694
1
        let option_asc = SortOptions {
695
1
            descending: false,
696
1
            nulls_first: false,
697
1
        };
698
1
        let option_desc = SortOptions {
699
1
            descending: false,
700
1
            nulls_first: false,
701
1
        };
702
1
        let schema = mem_exec.schema();
703
1
        let partial_sort_executor = PartialSortExec::new(
704
1
            vec![
705
1
                PhysicalSortExpr {
706
1
                    expr: col("a", &schema)
?0
,
707
1
                    options: option_asc,
708
1
                },
709
1
                PhysicalSortExpr {
710
1
                    expr: col("b", &schema)
?0
,
711
1
                    options: option_desc,
712
1
                },
713
1
                PhysicalSortExpr {
714
1
                    expr: col("c", &schema)
?0
,
715
1
                    options: option_asc,
716
1
                },
717
1
            ],
718
1
            Arc::clone(&mem_exec),
719
1
            1,
720
1
        );
721
1
        let partial_sort_exec =
722
1
            Arc::new(partial_sort_executor.clone()) as Arc<dyn ExecutionPlan>;
723
1
        let sort_exec = Arc::new(SortExec::new(
724
1
            partial_sort_executor.expr,
725
1
            partial_sort_executor.input,
726
1
        )) as Arc<dyn ExecutionPlan>;
727
1
        let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).
await0
?0
;
728
1
        assert_eq!(
729
5
            result.iter().map(|r| r.num_rows()).collect_vec(),
730
1
            [0, 125, 125, 0, 150]
731
1
        );
732
1
733
1
        assert_eq!(
734
1
            task_ctx.runtime_env().memory_pool.reserved(),
735
1
            0,
736
1
            
"The sort should have returned all memory used back to the memory manager"0
737
1
        );
738
1
        let partial_sort_result = concat_batches(&schema, &result).unwrap();
739
1
        let sort_result = collect(sort_exec, Arc::clone(&task_ctx)).
await0
?0
;
740
1
        assert_eq!(sort_result[0], partial_sort_result);
741
1
742
1
        Ok(())
743
1
    }
744
745
    #[tokio::test]
746
1
    async fn test_partitioned_input_partial_sort_with_fetch() -> Result<()> {
747
1
        let task_ctx = Arc::new(TaskContext::default());
748
1
        let mem_exec = prepare_partitioned_input();
749
1
        let schema = mem_exec.schema();
750
1
        let option_asc = SortOptions {
751
1
            descending: false,
752
1
            nulls_first: false,
753
1
        };
754
1
        let option_desc = SortOptions {
755
1
            descending: false,
756
1
            nulls_first: false,
757
1
        };
758
4
        for (fetch_size, expected_batch_num_rows) in [
759
1
            (Some(50), vec![0, 50]),
760
1
            (Some(120), vec![0, 120]),
761
1
            (Some(150), vec![0, 125, 25]),
762
1
            (Some(250), vec![0, 125, 125]),
763
1
        ] {
764
4
            let partial_sort_executor = PartialSortExec::new(
765
4
                vec![
766
4
                    PhysicalSortExpr {
767
4
                        expr: col("a", &schema)
?0
,
768
4
                        options: option_asc,
769
4
                    },
770
4
                    PhysicalSortExpr {
771
4
                        expr: col("b", &schema)
?0
,
772
4
                        options: option_desc,
773
4
                    },
774
4
                    PhysicalSortExpr {
775
4
                        expr: col("c", &schema)
?0
,
776
4
                        options: option_asc,
777
4
                    },
778
4
                ],
779
4
                Arc::clone(&mem_exec),
780
4
                1,
781
4
            )
782
4
            .with_fetch(fetch_size);
783
4
784
4
            let partial_sort_exec =
785
4
                Arc::new(partial_sort_executor.clone()) as Arc<dyn ExecutionPlan>;
786
4
            let sort_exec = Arc::new(
787
4
                SortExec::new(partial_sort_executor.expr, partial_sort_executor.input)
788
4
                    .with_fetch(fetch_size),
789
4
            ) as Arc<dyn ExecutionPlan>;
790
4
            let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).
await0
?0
;
791
4
            assert_eq!(
792
10
                result.iter().map(|r| r.num_rows()).collect_vec(),
793
4
                expected_batch_num_rows
794
4
            );
795
1
796
4
            assert_eq!(
797
4
                task_ctx.runtime_env().memory_pool.reserved(),
798
1
                0,
799
1
                
"The sort should have returned all memory used back to the memory manager"0
800
1
            );
801
4
            let partial_sort_result = concat_batches(&schema, &result)
?0
;
802
4
            let sort_result = collect(sort_exec, Arc::clone(&task_ctx)).
await0
?0
;
803
4
            assert_eq!(sort_result[0], partial_sort_result);
804
1
        }
805
1
806
1
        Ok(())
807
1
    }
808
809
    #[tokio::test]
810
1
    async fn test_sort_metadata() -> Result<()> {
811
1
        let task_ctx = Arc::new(TaskContext::default());
812
1
        let field_metadata: HashMap<String, String> =
813
1
            vec![("foo".to_string(), "bar".to_string())]
814
1
                .into_iter()
815
1
                .collect();
816
1
        let schema_metadata: HashMap<String, String> =
817
1
            vec![("baz".to_string(), "barf".to_string())]
818
1
                .into_iter()
819
1
                .collect();
820
1
821
1
        let mut field = Field::new("field_name", DataType::UInt64, true);
822
1
        field.set_metadata(field_metadata.clone());
823
1
        let schema = Schema::new_with_metadata(vec![field], schema_metadata.clone());
824
1
        let schema = Arc::new(schema);
825
1
826
1
        let data: ArrayRef =
827
1
            Arc::new(vec![1, 1, 2].into_iter().map(Some).collect::<UInt64Array>());
828
1
829
1
        let batch = RecordBatch::try_new(Arc::clone(&schema), vec![data])
?0
;
830
1
        let input = Arc::new(MemoryExec::try_new(
831
1
            &[vec![batch]],
832
1
            Arc::clone(&schema),
833
1
            None,
834
1
        )
?0
);
835
1
836
1
        let partial_sort_exec = Arc::new(PartialSortExec::new(
837
1
            vec![PhysicalSortExpr {
838
1
                expr: col("field_name", &schema)
?0
,
839
1
                options: SortOptions::default(),
840
1
            }],
841
1
            input,
842
1
            1,
843
1
        ));
844
1
845
1
        let result: Vec<RecordBatch> = collect(partial_sort_exec, task_ctx).
await0
?0
;
846
1
        let expected_batch = vec![
847
1
            RecordBatch::try_new(
848
1
                Arc::clone(&schema),
849
1
                vec![Arc::new(
850
1
                    vec![1, 1].into_iter().map(Some).collect::<UInt64Array>(),
851
1
                )],
852
1
            )
?0
,
853
1
            RecordBatch::try_new(
854
1
                Arc::clone(&schema),
855
1
                vec![Arc::new(
856
1
                    vec![2].into_iter().map(Some).collect::<UInt64Array>(),
857
1
                )],
858
1
            )
?0
,
859
1
        ];
860
1
861
1
        // Data is correct
862
1
        assert_eq!(&expected_batch, &result);
863
1
864
1
        // explicitly ensure the metadata is present
865
1
        assert_eq!(result[0].schema().fields()[0].metadata(), &field_metadata);
866
1
        assert_eq!(result[0].schema().metadata(), &schema_metadata);
867
1
868
1
        Ok(())
869
1
    }
870
871
    #[tokio::test]
872
1
    async fn test_lex_sort_by_float() -> Result<()> {
873
1
        let task_ctx = Arc::new(TaskContext::default());
874
1
        let schema = Arc::new(Schema::new(vec![
875
1
            Field::new("a", DataType::Float32, true),
876
1
            Field::new("b", DataType::Float64, true),
877
1
            Field::new("c", DataType::Float64, true),
878
1
        ]));
879
1
        let option_asc = SortOptions {
880
1
            descending: false,
881
1
            nulls_first: true,
882
1
        };
883
1
        let option_desc = SortOptions {
884
1
            descending: true,
885
1
            nulls_first: true,
886
1
        };
887
1
888
1
        // define data.
889
1
        let batch = RecordBatch::try_new(
890
1
            Arc::clone(&schema),
891
1
            vec![
892
1
                Arc::new(Float32Array::from(vec![
893
1
                    Some(1.0_f32),
894
1
                    Some(1.0_f32),
895
1
                    Some(1.0_f32),
896
1
                    Some(2.0_f32),
897
1
                    Some(2.0_f32),
898
1
                    Some(3.0_f32),
899
1
                    Some(3.0_f32),
900
1
                    Some(3.0_f32),
901
1
                ])),
902
1
                Arc::new(Float64Array::from(vec![
903
1
                    Some(20.0_f64),
904
1
                    Some(20.0_f64),
905
1
                    Some(40.0_f64),
906
1
                    Some(40.0_f64),
907
1
                    Some(f64::NAN),
908
1
                    None,
909
1
                    None,
910
1
                    Some(f64::NAN),
911
1
                ])),
912
1
                Arc::new(Float64Array::from(vec![
913
1
                    Some(10.0_f64),
914
1
                    Some(20.0_f64),
915
1
                    Some(10.0_f64),
916
1
                    Some(100.0_f64),
917
1
                    Some(f64::NAN),
918
1
                    Some(100.0_f64),
919
1
                    None,
920
1
                    Some(f64::NAN),
921
1
                ])),
922
1
            ],
923
1
        )
?0
;
924
1
925
1
        let partial_sort_exec = Arc::new(PartialSortExec::new(
926
1
            vec![
927
1
                PhysicalSortExpr {
928
1
                    expr: col("a", &schema)
?0
,
929
1
                    options: option_asc,
930
1
                },
931
1
                PhysicalSortExpr {
932
1
                    expr: col("b", &schema)
?0
,
933
1
                    options: option_asc,
934
1
                },
935
1
                PhysicalSortExpr {
936
1
                    expr: col("c", &schema)
?0
,
937
1
                    options: option_desc,
938
1
                },
939
1
            ],
940
1
            Arc::new(MemoryExec::try_new(&[vec![batch]], schema, None)
?0
),
941
1
            2,
942
1
        ));
943
1
944
1
        let expected = [
945
1
            "+-----+------+-------+",
946
1
            "| a   | b    | c     |",
947
1
            "+-----+------+-------+",
948
1
            "| 1.0 | 20.0 | 20.0  |",
949
1
            "| 1.0 | 20.0 | 10.0  |",
950
1
            "| 1.0 | 40.0 | 10.0  |",
951
1
            "| 2.0 | 40.0 | 100.0 |",
952
1
            "| 2.0 | NaN  | NaN   |",
953
1
            "| 3.0 |      |       |",
954
1
            "| 3.0 |      | 100.0 |",
955
1
            "| 3.0 | NaN  | NaN   |",
956
1
            "+-----+------+-------+",
957
1
        ];
958
1
959
1
        assert_eq!(
960
1
            DataType::Float32,
961
1
            *partial_sort_exec.schema().field(0).data_type()
962
1
        );
963
1
        assert_eq!(
964
1
            DataType::Float64,
965
1
            *partial_sort_exec.schema().field(1).data_type()
966
1
        );
967
1
        assert_eq!(
968
1
            DataType::Float64,
969
1
            *partial_sort_exec.schema().field(2).data_type()
970
1
        );
971
1
972
1
        let result: Vec<RecordBatch> = collect(
973
1
            Arc::clone(&partial_sort_exec) as Arc<dyn ExecutionPlan>,
974
1
            task_ctx,
975
1
        )
976
1
        .
await0
?0
;
977
1
        assert_batches_eq!(expected, &result);
978
1
        assert_eq!(result.len(), 2);
979
1
        let metrics = partial_sort_exec.metrics().unwrap();
980
1
        assert!(metrics.elapsed_compute().unwrap() > 0);
981
1
        assert_eq!(metrics.output_rows().unwrap(), 8);
982
1
983
1
        let columns = result[0].columns();
984
1
985
1
        assert_eq!(DataType::Float32, *columns[0].data_type());
986
1
        assert_eq!(DataType::Float64, *columns[1].data_type());
987
1
        assert_eq!(DataType::Float64, *columns[2].data_type());
988
1
989
1
        Ok(())
990
1
    }
991
992
    #[tokio::test]
993
1
    async fn test_drop_cancel() -> Result<()> {
994
1
        let task_ctx = Arc::new(TaskContext::default());
995
1
        let schema = Arc::new(Schema::new(vec![
996
1
            Field::new("a", DataType::Float32, true),
997
1
            Field::new("b", DataType::Float32, true),
998
1
        ]));
999
1
1000
1
        let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1));
1001
1
        let refs = blocking_exec.refs();
1002
1
        let sort_exec = Arc::new(PartialSortExec::new(
1003
1
            vec![PhysicalSortExpr {
1004
1
                expr: col("a", &schema)
?0
,
1005
1
                options: SortOptions::default(),
1006
1
            }],
1007
1
            blocking_exec,
1008
1
            1,
1009
1
        ));
1010
1
1011
1
        let fut = collect(sort_exec, Arc::clone(&task_ctx));
1012
1
        let mut fut = fut.boxed();
1013
1
1014
1
        assert_is_pending(&mut fut);
1015
1
        drop(fut);
1016
1
        assert_strong_count_converges_to_zero(refs).
await0
;
1017
1
1018
1
        assert_eq!(
1019
1
            task_ctx.runtime_env().memory_pool.reserved(),
1020
1
            0,
1021
1
            
"The sort should have returned all memory used back to the memory manager"0
1022
1
        );
1023
1
1024
1
        Ok(())
1025
1
    }
1026
}