/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/sorts/partial_sort.rs
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Partial Sort deals with input data that partially |
19 | | //! satisfies the required sort order. Such input data can be |
20 | | //! partitioned into segments where each segment already contains |
21 | | //! all the rows needed for lexicographic sorting, so sorting |
22 | | //! can be done without loading the entire dataset. |
23 | | //! |
24 | | //! Consider a sort plan having an input with ordering `a ASC, b ASC` |
25 | | //! |
26 | | //! ```text |
27 | | //! +---+---+---+ |
28 | | //! | a | b | d | |
29 | | //! +---+---+---+ |
30 | | //! | 0 | 0 | 3 | |
31 | | //! | 0 | 0 | 2 | |
32 | | //! | 0 | 1 | 1 | |
33 | | //! | 0 | 2 | 0 | |
34 | | //! +---+---+---+ |
35 | | //!``` |
36 | | //! |
37 | | //! and the required ordering for the plan is `a ASC, b ASC, d ASC`. |
38 | | //! The first 3 rows (a segment) can be sorted right away because the |
39 | | //! segment already contains every row with that prefix, but the last |
40 | | //! row requires further input, since the next batch may start with |
41 | | //! rows where `a` and `b` do not change, as below: |
42 | | //! |
43 | | //! ```text |
44 | | //! +---+---+---+ |
45 | | //! | a | b | d | |
46 | | //! +---+---+---+ |
47 | | //! | 0 | 2 | 4 | |
48 | | //! +---+---+---+ |
49 | | //!``` |
50 | | //! |
51 | | //! The plan concatenates such trailing rows of the previous input with |
52 | | //! incoming data and continues partial sorting of the segments. |
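 | | //! |
 | | //! As an illustration (a sketch only, not a compiled doctest; the |
 | | //! `input_plan` and the three sort expressions are assumed to exist), |
 | | //! such a plan would be built with a common prefix length of 2, since |
 | | //! the input already satisfies `a ASC, b ASC`: |
 | | //! |
 | | //! ```text |
 | | //! // input_plan is ordered by a ASC, b ASC; required: a ASC, b ASC, d ASC |
 | | //! let partial_sort = PartialSortExec::new( |
 | | //!     vec![sort_expr_a, sort_expr_b, sort_expr_d], // full required ordering |
 | | //!     input_plan,                                  // Arc<dyn ExecutionPlan> |
 | | //!     2,                                           // common_prefix_length |
 | | //! ); |
 | | //! ``` |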
53 | | |
54 | | use std::any::Any; |
55 | | use std::fmt::Debug; |
56 | | use std::pin::Pin; |
57 | | use std::sync::Arc; |
58 | | use std::task::{Context, Poll}; |
59 | | |
60 | | use crate::expressions::PhysicalSortExpr; |
61 | | use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; |
62 | | use crate::sorts::sort::sort_batch; |
63 | | use crate::{ |
64 | | DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties, |
65 | | Partitioning, PlanProperties, SendableRecordBatchStream, Statistics, |
66 | | }; |
67 | | |
68 | | use arrow::compute::concat_batches; |
69 | | use arrow::datatypes::SchemaRef; |
70 | | use arrow::record_batch::RecordBatch; |
71 | | use datafusion_common::utils::evaluate_partition_ranges; |
72 | | use datafusion_common::Result; |
73 | | use datafusion_execution::{RecordBatchStream, TaskContext}; |
74 | | use datafusion_physical_expr::LexOrdering; |
75 | | |
76 | | use futures::{ready, Stream, StreamExt}; |
77 | | use log::trace; |
78 | | |
79 | | /// Partial Sort execution plan. |
80 | | #[derive(Debug, Clone)] |
81 | | pub struct PartialSortExec { |
82 | | /// Input schema |
83 | | pub(crate) input: Arc<dyn ExecutionPlan>, |
84 | | /// Sort expressions |
85 | | expr: Vec<PhysicalSortExpr>, |
86 | | /// Number of leading sort expressions that the input's existing |
87 | | /// ordering already satisfies (the common prefix length) |
88 | | common_prefix_length: usize, |
89 | | /// Set of all metrics created during the sort |
90 | | metrics_set: ExecutionPlanMetricsSet, |
91 | | /// Preserve partitions of input plan. If false, the input partitions |
92 | | /// will be sorted and merged into a single output partition. |
93 | | preserve_partitioning: bool, |
94 | | /// Fetch highest/lowest n results |
95 | | fetch: Option<usize>, |
96 | | /// Cache holding plan properties like equivalences, output partitioning etc. |
97 | | cache: PlanProperties, |
98 | | } |
99 | | |
100 | | impl PartialSortExec { |
101 | | /// Create a new partial sort execution plan |
102 | 13 | pub fn new( |
103 | 13 | expr: Vec<PhysicalSortExpr>, |
104 | 13 | input: Arc<dyn ExecutionPlan>, |
105 | 13 | common_prefix_length: usize, |
106 | 13 | ) -> Self { |
107 | 13 | debug_assert!(common_prefix_length > 0); |
108 | 13 | let preserve_partitioning = false; |
109 | 13 | let cache = Self::compute_properties(&input, expr.clone(), preserve_partitioning); |
110 | 13 | Self { |
111 | 13 | input, |
112 | 13 | expr, |
113 | 13 | common_prefix_length, |
114 | 13 | metrics_set: ExecutionPlanMetricsSet::new(), |
115 | 13 | preserve_partitioning, |
116 | 13 | fetch: None, |
117 | 13 | cache, |
118 | 13 | } |
119 | 13 | } |
120 | | |
121 | | /// Whether this `PartialSortExec` preserves partitioning of the children |
122 | 0 | pub fn preserve_partitioning(&self) -> bool { |
123 | 0 | self.preserve_partitioning |
124 | 0 | } |
125 | | |
126 | | /// Specify the partitioning behavior of this partial sort exec |
127 | | /// |
128 | | /// If `preserve_partitioning` is true, sorts each partition |
129 | | /// individually, producing one sorted stream for each input partition. |
130 | | /// |
131 | | /// If `preserve_partitioning` is false, sorts and merges all |
132 | | /// input partitions producing a single, sorted partition. |
133 | 0 | pub fn with_preserve_partitioning(mut self, preserve_partitioning: bool) -> Self { |
134 | 0 | self.preserve_partitioning = preserve_partitioning; |
135 | 0 | self.cache = self |
136 | 0 | .cache |
137 | 0 | .with_partitioning(Self::output_partitioning_helper( |
138 | 0 | &self.input, |
139 | 0 | self.preserve_partitioning, |
140 | 0 | )); |
141 | 0 | self |
142 | 0 | } |
143 | | |
144 | | /// Modify how many rows to include in the result |
145 | | /// |
146 | | /// If None, then all rows will be returned, in sorted order. |
147 | | /// If Some, then only the top `fetch` rows will be returned. |
148 | | /// This can reduce the memory required by the sort |
149 | | /// operation since rows that are not going to be included |
150 | | /// can be dropped. |
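 | | /// |
 | | /// For example (illustrative only), |
 | | /// `PartialSortExec::new(exprs, input, 2).with_fetch(Some(10))` |
 | | /// returns at most the first 10 rows of the sorted output. |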
151 | 6 | pub fn with_fetch(mut self, fetch: Option<usize>) -> Self { |
152 | 6 | self.fetch = fetch; |
153 | 6 | self |
154 | 6 | } |
155 | | |
156 | | /// Input schema |
157 | 0 | pub fn input(&self) -> &Arc<dyn ExecutionPlan> { |
158 | 0 | &self.input |
159 | 0 | } |
160 | | |
161 | | /// Sort expressions |
162 | 0 | pub fn expr(&self) -> &[PhysicalSortExpr] { |
163 | 0 | &self.expr |
164 | 0 | } |
165 | | |
166 | | /// If `Some(fetch)`, limits output to only the first "fetch" items |
167 | 0 | pub fn fetch(&self) -> Option<usize> { |
168 | 0 | self.fetch |
169 | 0 | } |
170 | | |
171 | 13 | fn output_partitioning_helper( |
172 | 13 | input: &Arc<dyn ExecutionPlan>, |
173 | 13 | preserve_partitioning: bool, |
174 | 13 | ) -> Partitioning { |
175 | 13 | // Get output partitioning: |
176 | 13 | if preserve_partitioning { |
177 | 0 | input.output_partitioning().clone() |
178 | | } else { |
179 | 13 | Partitioning::UnknownPartitioning(1) |
180 | | } |
181 | 13 | } |
182 | | |
183 | | /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc. |
184 | 13 | fn compute_properties( |
185 | 13 | input: &Arc<dyn ExecutionPlan>, |
186 | 13 | sort_exprs: LexOrdering, |
187 | 13 | preserve_partitioning: bool, |
188 | 13 | ) -> PlanProperties { |
189 | 13 | // Calculate equivalence properties; i.e. reset the ordering equivalence |
190 | 13 | // class with the new ordering: |
191 | 13 | let eq_properties = input |
192 | 13 | .equivalence_properties() |
193 | 13 | .clone() |
194 | 13 | .with_reorder(sort_exprs); |
195 | 13 | |
196 | 13 | // Get output partitioning: |
197 | 13 | let output_partitioning = |
198 | 13 | Self::output_partitioning_helper(input, preserve_partitioning); |
199 | 13 | |
200 | 13 | // Determine execution mode: |
201 | 13 | let mode = input.execution_mode(); |
202 | 13 | |
203 | 13 | PlanProperties::new(eq_properties, output_partitioning, mode) |
204 | 13 | } |
205 | | } |
206 | | |
207 | | impl DisplayAs for PartialSortExec { |
208 | 0 | fn fmt_as( |
209 | 0 | &self, |
210 | 0 | t: DisplayFormatType, |
211 | 0 | f: &mut std::fmt::Formatter, |
212 | 0 | ) -> std::fmt::Result { |
213 | 0 | match t { |
214 | | DisplayFormatType::Default | DisplayFormatType::Verbose => { |
215 | 0 | let expr = PhysicalSortExpr::format_list(&self.expr); |
216 | 0 | let common_prefix_length = self.common_prefix_length; |
217 | 0 | match self.fetch { |
218 | 0 | Some(fetch) => { |
219 | 0 | write!(f, "PartialSortExec: TopK(fetch={fetch}), expr=[{expr}], common_prefix_length=[{common_prefix_length}]", ) |
220 | | } |
221 | 0 | None => write!(f, "PartialSortExec: expr=[{expr}], common_prefix_length=[{common_prefix_length}]"), |
222 | | } |
223 | | } |
224 | | } |
225 | 0 | } |
226 | | } |
227 | | |
228 | | impl ExecutionPlan for PartialSortExec { |
229 | 0 | fn name(&self) -> &'static str { |
230 | 0 | "PartialSortExec" |
231 | 0 | } |
232 | | |
233 | 0 | fn as_any(&self) -> &dyn Any { |
234 | 0 | self |
235 | 0 | } |
236 | | |
237 | 16 | fn properties(&self) -> &PlanProperties { |
238 | 16 | &self.cache |
239 | 16 | } |
240 | | |
241 | 0 | fn fetch(&self) -> Option<usize> { |
242 | 0 | self.fetch |
243 | 0 | } |
244 | | |
245 | 0 | fn required_input_distribution(&self) -> Vec<Distribution> { |
246 | 0 | if self.preserve_partitioning { |
247 | 0 | vec![Distribution::UnspecifiedDistribution] |
248 | | } else { |
249 | 0 | vec![Distribution::SinglePartition] |
250 | | } |
251 | 0 | } |
252 | | |
253 | 0 | fn benefits_from_input_partitioning(&self) -> Vec<bool> { |
254 | 0 | vec![false] |
255 | 0 | } |
256 | | |
257 | 0 | fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> { |
258 | 0 | vec![&self.input] |
259 | 0 | } |
260 | | |
261 | 0 | fn with_new_children( |
262 | 0 | self: Arc<Self>, |
263 | 0 | children: Vec<Arc<dyn ExecutionPlan>>, |
264 | 0 | ) -> Result<Arc<dyn ExecutionPlan>> { |
265 | 0 | let new_partial_sort = PartialSortExec::new( |
266 | 0 | self.expr.clone(), |
267 | 0 | Arc::clone(&children[0]), |
268 | 0 | self.common_prefix_length, |
269 | 0 | ) |
270 | 0 | .with_fetch(self.fetch) |
271 | 0 | .with_preserve_partitioning(self.preserve_partitioning); |
272 | 0 | |
273 | 0 | Ok(Arc::new(new_partial_sort)) |
274 | 0 | } |
275 | | |
276 | 13 | fn execute( |
277 | 13 | &self, |
278 | 13 | partition: usize, |
279 | 13 | context: Arc<TaskContext>, |
280 | 13 | ) -> Result<SendableRecordBatchStream> { |
281 | 13 | trace!("Start PartialSortExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id()); |
282 | | |
283 | 13 | let input = self.input.execute(partition, Arc::clone(&context))?; |
284 | | |
285 | 13 | trace!( |
286 | 0 | "End PartialSortExec's input.execute for partition: {}", |
287 | | partition |
288 | | ); |
289 | | |
290 | | // Make sure common prefix length is larger than 0 |
291 | | // Otherwise, we should use SortExec. |
292 | 13 | debug_assert!(self.common_prefix_length > 0); |
293 | | |
294 | 13 | Ok(Box::pin(PartialSortStream { |
295 | 13 | input, |
296 | 13 | expr: self.expr.clone(), |
297 | 13 | common_prefix_length: self.common_prefix_length, |
298 | 13 | in_mem_batches: vec![], |
299 | 13 | fetch: self.fetch, |
300 | 13 | is_closed: false, |
301 | 13 | baseline_metrics: BaselineMetrics::new(&self.metrics_set, partition), |
302 | 13 | })) |
303 | 13 | } |
304 | | |
305 | 1 | fn metrics(&self) -> Option<MetricsSet> { |
306 | 1 | Some(self.metrics_set.clone_inner()) |
307 | 1 | } |
308 | | |
309 | 0 | fn statistics(&self) -> Result<Statistics> { |
310 | 0 | self.input.statistics() |
311 | 0 | } |
312 | | } |
313 | | |
314 | | struct PartialSortStream { |
315 | | /// The input plan |
316 | | input: SendableRecordBatchStream, |
317 | | /// Sort expressions |
318 | | expr: Vec<PhysicalSortExpr>, |
319 | | /// Length of the prefix common to the input ordering and the required |
320 | | /// ordering of the plan; must be greater than 0, otherwise PartialSort is not applicable |
321 | | common_prefix_length: usize, |
322 | | /// Used as a buffer for part of the input not ready for sort |
323 | | in_mem_batches: Vec<RecordBatch>, |
324 | | /// Fetch top N results |
325 | | fetch: Option<usize>, |
326 | | /// Whether the stream has finished returning all of its data or not |
327 | | is_closed: bool, |
328 | | /// Execution metrics |
329 | | baseline_metrics: BaselineMetrics, |
330 | | } |
331 | | |
332 | | impl Stream for PartialSortStream { |
333 | | type Item = Result<RecordBatch>; |
334 | | |
335 | 42 | fn poll_next( |
336 | 42 | mut self: Pin<&mut Self>, |
337 | 42 | cx: &mut Context<'_>, |
338 | 42 | ) -> Poll<Option<Self::Item>> { |
339 | 42 | let poll = self.poll_next_inner(cx); |
340 | 42 | self.baseline_metrics.record_poll(poll) |
341 | 42 | } |
342 | | |
343 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
344 | 0 | // we can't predict the size of incoming batches so re-use the size hint from the input |
345 | 0 | self.input.size_hint() |
346 | 0 | } |
347 | | } |
348 | | |
349 | | impl RecordBatchStream for PartialSortStream { |
350 | 29 | fn schema(&self) -> SchemaRef { |
351 | 29 | self.input.schema() |
352 | 29 | } |
353 | | } |
354 | | |
355 | | impl PartialSortStream { |
356 | 42 | fn poll_next_inner( |
357 | 42 | self: &mut Pin<&mut Self>, |
358 | 42 | cx: &mut Context<'_>, |
359 | 42 | ) -> Poll<Option<Result<RecordBatch>>> { |
360 | 42 | if self.is_closed { |
361 | 12 | return Poll::Ready(None); |
362 | 30 | } |
363 | 30 | let result = match ready!(self.input.poll_next_unpin(cx)) { |
364 | 21 | Some(Ok(batch)) => { |
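 | | // Rows before the slice point form complete segments with respect to |
 | | // the already-sorted prefix columns, so they can be sorted and emitted |
 | | // now; the remainder is buffered until later input (or the end of the |
 | | // input) completes its segment. |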
365 | 15 | if let Some(slice_point) = |
366 | 21 | self.get_slice_point(self.common_prefix_length, &batch)? |
367 | | { |
368 | 15 | self.in_mem_batches.push(batch.slice(0, slice_point)); |
369 | 15 | let remaining_batch = |
370 | 15 | batch.slice(slice_point, batch.num_rows() - slice_point); |
371 | 15 | let sorted_batch = self.sort_in_mem_batches(); |
372 | 15 | self.in_mem_batches.push(remaining_batch); |
373 | 15 | sorted_batch |
374 | | } else { |
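 | | // No complete segment yet: buffer the whole batch and return an empty |
 | | // batch for this poll. |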
375 | 6 | self.in_mem_batches.push(batch); |
376 | 6 | Ok(RecordBatch::new_empty(self.schema())) |
377 | | } |
378 | | } |
379 | 0 | Some(Err(e)) => Err(e), |
380 | | None => { |
381 | 8 | self.is_closed = true; |
382 | 8 | // once input is consumed, sort the rest of the inserted batches |
383 | 8 | self.sort_in_mem_batches() |
384 | | } |
385 | | }; |
386 | | |
387 | 29 | Poll::Ready(Some(result)) |
388 | 42 | } |
389 | | |
390 | | /// Returns a sorted RecordBatch from in_mem_batches and clears in_mem_batches |
391 | | /// |
392 | | /// If a fetch limit is specified, `sort_in_mem_batches` limits the returned |
393 | | /// RecordBatch accordingly and marks the stream as closed once the limit is reached |
394 | 23 | fn sort_in_mem_batches(self: &mut Pin<&mut Self>) -> Result<RecordBatch> { |
395 | 23 | let input_batch = concat_batches(&self.schema(), &self.in_mem_batches)?; |
396 | 23 | self.in_mem_batches.clear(); |
397 | 23 | let result = sort_batch(&input_batch, &self.expr, self.fetch)?; |
398 | 23 | if let Some(remaining_fetch) = self.fetch { |
399 | | // remaining_fetch - result.num_rows() is always >= 0 |
400 | | // because the number of rows returned by sort_batch with a limit |
401 | | // cannot exceed the requested limit |
402 | 10 | self.fetch = Some(remaining_fetch - result.num_rows()); |
403 | 10 | if remaining_fetch == result.num_rows() { |
404 | 6 | self.is_closed = true; |
405 | 6 | } |
406 | 13 | } |
407 | 23 | Ok(result) |
408 | 23 | } |
409 | | |
410 | | /// Return the end index of the second-to-last partition if the batch |
411 | | /// can be partitioned based on its already sorted columns |
412 | | /// |
413 | | /// Return None if the batch cannot be partitioned, which means the |
414 | | /// batch does not yet contain enough information for a safe sort |
415 | 21 | fn get_slice_point( |
416 | 21 | &self, |
417 | 21 | common_prefix_len: usize, |
418 | 21 | batch: &RecordBatch, |
419 | 21 | ) -> Result<Option<usize>> { |
420 | 21 | let common_prefix_sort_keys = (0..common_prefix_len) |
421 | 25 | .map(|idx| self.expr[idx].evaluate_to_sort_column(batch)) |
422 | 21 | .collect::<Result<Vec<_>>>()?; |
423 | 21 | let partition_points = |
424 | 21 | evaluate_partition_ranges(batch.num_rows(), &common_prefix_sort_keys)?; |
425 | | // If partition points are [0..100], [100..200], [200..300] |
426 | | // we should return 200, which is the safest and furthest partition boundary |
427 | | // Please note that we shouldn't return 300 (which is the number of rows in the batch), |
428 | | // because this boundary may change with new data. |
429 | 21 | if partition_points.len() >= 2 { |
430 | 15 | Ok(Some(partition_points[partition_points.len() - 2].end)) |
431 | | } else { |
432 | 6 | Ok(None) |
433 | | } |
434 | 21 | } |
435 | | } |
436 | | |
437 | | #[cfg(test)] |
438 | | mod tests { |
439 | | use std::collections::HashMap; |
440 | | |
441 | | use arrow::array::*; |
442 | | use arrow::compute::SortOptions; |
443 | | use arrow::datatypes::*; |
444 | | use futures::FutureExt; |
445 | | use itertools::Itertools; |
446 | | |
447 | | use datafusion_common::assert_batches_eq; |
448 | | |
449 | | use crate::collect; |
450 | | use crate::expressions::col; |
451 | | use crate::memory::MemoryExec; |
452 | | use crate::sorts::sort::SortExec; |
453 | | use crate::test; |
454 | | use crate::test::assert_is_pending; |
455 | | use crate::test::exec::{assert_strong_count_converges_to_zero, BlockingExec}; |
456 | | |
457 | | use super::*; |
458 | | |
459 | | #[tokio::test] |
460 | 1 | async fn test_partial_sort() -> Result<()> { |
461 | 1 | let task_ctx = Arc::new(TaskContext::default()); |
462 | 1 | let source = test::build_table_scan_i32( |
463 | 1 | ("a", &vec![0, 0, 0, 1, 1, 1]), |
464 | 1 | ("b", &vec![1, 1, 2, 2, 3, 3]), |
465 | 1 | ("c", &vec![1, 0, 5, 4, 3, 2]), |
466 | 1 | ); |
467 | 1 | let schema = Schema::new(vec![ |
468 | 1 | Field::new("a", DataType::Int32, false), |
469 | 1 | Field::new("b", DataType::Int32, false), |
470 | 1 | Field::new("c", DataType::Int32, false), |
471 | 1 | ]); |
472 | 1 | let option_asc = SortOptions { |
473 | 1 | descending: false, |
474 | 1 | nulls_first: false, |
475 | 1 | }; |
476 | 1 | |
477 | 1 | let partial_sort_exec = Arc::new(PartialSortExec::new( |
478 | 1 | vec![ |
479 | 1 | PhysicalSortExpr { |
480 | 1 | expr: col("a", &schema)?, |
481 | 1 | options: option_asc, |
482 | 1 | }, |
483 | 1 | PhysicalSortExpr { |
484 | 1 | expr: col("b", &schema)?, |
485 | 1 | options: option_asc, |
486 | 1 | }, |
487 | 1 | PhysicalSortExpr { |
488 | 1 | expr: col("c", &schema)?, |
489 | 1 | options: option_asc, |
490 | 1 | }, |
491 | 1 | ], |
492 | 1 | Arc::clone(&source), |
493 | 1 | 2, |
494 | 1 | )) as Arc<dyn ExecutionPlan>; |
495 | 1 | |
496 | 1 | let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).await?; |
497 | 1 | |
498 | 1 | let expected_after_sort = [ |
499 | 1 | "+---+---+---+", |
500 | 1 | "| a | b | c |", |
501 | 1 | "+---+---+---+", |
502 | 1 | "| 0 | 1 | 0 |", |
503 | 1 | "| 0 | 1 | 1 |", |
504 | 1 | "| 0 | 2 | 5 |", |
505 | 1 | "| 1 | 2 | 4 |", |
506 | 1 | "| 1 | 3 | 2 |", |
507 | 1 | "| 1 | 3 | 3 |", |
508 | 1 | "+---+---+---+", |
509 | 1 | ]; |
510 | 1 | assert_eq!(2, result.len()); |
511 | 1 | assert_batches_eq!(expected_after_sort, &result); |
512 | 1 | assert_eq!( |
513 | 1 | task_ctx.runtime_env().memory_pool.reserved(), |
514 | 1 | 0, |
515 | 1 | "The sort should have returned all memory used back to the memory manager" |
516 | 1 | ); |
517 | 1 | |
518 | 1 | Ok(()) |
519 | 1 | } |
520 | | |
521 | | #[tokio::test] |
522 | 1 | async fn test_partial_sort_with_fetch() -> Result<()> { |
523 | 1 | let task_ctx = Arc::new(TaskContext::default()); |
524 | 1 | let source = test::build_table_scan_i32( |
525 | 1 | ("a", &vec![0, 0, 1, 1, 1]), |
526 | 1 | ("b", &vec![1, 2, 2, 3, 3]), |
527 | 1 | ("c", &vec![4, 3, 2, 1, 0]), |
528 | 1 | ); |
529 | 1 | let schema = Schema::new(vec![ |
530 | 1 | Field::new("a", DataType::Int32, false), |
531 | 1 | Field::new("b", DataType::Int32, false), |
532 | 1 | Field::new("c", DataType::Int32, false), |
533 | 1 | ]); |
534 | 1 | let option_asc = SortOptions { |
535 | 1 | descending: false, |
536 | 1 | nulls_first: false, |
537 | 1 | }; |
538 | 1 | |
539 | 3 | for common_prefix_length in [1, 2] { |
540 | 2 | let partial_sort_exec = Arc::new( |
541 | 1 | PartialSortExec::new( |
542 | 2 | vec![ |
543 | 2 | PhysicalSortExpr { |
544 | 2 | expr: col("a", &schema)?, |
545 | 2 | options: option_asc, |
546 | 2 | }, |
547 | 2 | PhysicalSortExpr { |
548 | 2 | expr: col("b", &schema)?, |
549 | 2 | options: option_asc, |
550 | 2 | }, |
551 | 2 | PhysicalSortExpr { |
552 | 2 | expr: col("c", &schema)?, |
553 | 2 | options: option_asc, |
554 | 2 | }, |
555 | 2 | ], |
556 | 2 | Arc::clone(&source), |
557 | 2 | common_prefix_length, |
558 | 2 | ) |
559 | 2 | .with_fetch(Some(4)), |
560 | 1 | ) as Arc<dyn ExecutionPlan>; |
561 | 1 | |
562 | 2 | let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).await?; |
563 | 1 | |
564 | 2 | let expected_after_sort = [ |
565 | 2 | "+---+---+---+", |
566 | 2 | "| a | b | c |", |
567 | 2 | "+---+---+---+", |
568 | 2 | "| 0 | 1 | 4 |", |
569 | 2 | "| 0 | 2 | 3 |", |
570 | 2 | "| 1 | 2 | 2 |", |
571 | 2 | "| 1 | 3 | 0 |", |
572 | 2 | "+---+---+---+", |
573 | 2 | ]; |
574 | 2 | assert_eq!(2, result.len()); |
575 | 2 | assert_batches_eq!(expected_after_sort, &result); |
576 | 2 | assert_eq!( |
577 | 2 | task_ctx.runtime_env().memory_pool.reserved(), |
578 | 1 | 0, |
579 | 1 | "The sort should have returned all memory used back to the memory manager" |
580 | 1 | ); |
581 | 1 | } |
582 | 1 | |
583 | 1 | Ok(()) |
584 | 1 | } |
585 | | |
586 | | #[tokio::test] |
587 | 1 | async fn test_partial_sort2() -> Result<()> { |
588 | 1 | let task_ctx = Arc::new(TaskContext::default()); |
589 | 1 | let source_tables = [ |
590 | 1 | test::build_table_scan_i32( |
591 | 1 | ("a", &vec![0, 0, 0, 0, 1, 1, 1, 1]), |
592 | 1 | ("b", &vec![1, 1, 3, 3, 4, 4, 2, 2]), |
593 | 1 | ("c", &vec![7, 6, 5, 4, 3, 2, 1, 0]), |
594 | 1 | ), |
595 | 1 | test::build_table_scan_i32( |
596 | 1 | ("a", &vec![0, 0, 0, 0, 1, 1, 1, 1]), |
597 | 1 | ("b", &vec![1, 1, 3, 3, 2, 2, 4, 4]), |
598 | 1 | ("c", &vec![7, 6, 5, 4, 1, 0, 3, 2]), |
599 | 1 | ), |
600 | 1 | ]; |
601 | 1 | let schema = Schema::new(vec![ |
602 | 1 | Field::new("a", DataType::Int32, false), |
603 | 1 | Field::new("b", DataType::Int32, false), |
604 | 1 | Field::new("c", DataType::Int32, false), |
605 | 1 | ]); |
606 | 1 | let option_asc = SortOptions { |
607 | 1 | descending: false, |
608 | 1 | nulls_first: false, |
609 | 1 | }; |
610 | 2 | for (common_prefix_length, source) in |
611 | 1 | [(1, &source_tables[0]), (2, &source_tables[1])] |
612 | 1 | { |
613 | 2 | let partial_sort_exec = Arc::new(PartialSortExec::new( |
614 | 2 | vec![ |
615 | 2 | PhysicalSortExpr { |
616 | 2 | expr: col("a", &schema)?, |
617 | 2 | options: option_asc, |
618 | 2 | }, |
619 | 2 | PhysicalSortExpr { |
620 | 2 | expr: col("b", &schema)?, |
621 | 2 | options: option_asc, |
622 | 2 | }, |
623 | 2 | PhysicalSortExpr { |
624 | 2 | expr: col("c", &schema)?, |
625 | 2 | options: option_asc, |
626 | 2 | }, |
627 | 2 | ], |
628 | 2 | Arc::clone(source), |
629 | 2 | common_prefix_length, |
630 | 1 | )); |
631 | 1 | |
632 | 2 | let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).await?; |
633 | 2 | assert_eq!(2, result.len()); |
634 | 2 | assert_eq!( |
635 | 2 | task_ctx.runtime_env().memory_pool.reserved(), |
636 | 1 | 0, |
637 | 1 | "The sort should have returned all memory used back to the memory manager" |
638 | 1 | ); |
639 | 2 | let expected = [ |
640 | 2 | "+---+---+---+", |
641 | 2 | "| a | b | c |", |
642 | 2 | "+---+---+---+", |
643 | 2 | "| 0 | 1 | 6 |", |
644 | 2 | "| 0 | 1 | 7 |", |
645 | 2 | "| 0 | 3 | 4 |", |
646 | 2 | "| 0 | 3 | 5 |", |
647 | 2 | "| 1 | 2 | 0 |", |
648 | 2 | "| 1 | 2 | 1 |", |
649 | 2 | "| 1 | 4 | 2 |", |
650 | 2 | "| 1 | 4 | 3 |", |
651 | 2 | "+---+---+---+", |
652 | 2 | ]; |
653 | 2 | assert_batches_eq!(expected, &result); |
654 | 1 | } |
655 | 1 | Ok(()) |
656 | 1 | } |
657 | | |
658 | 2 | fn prepare_partitioned_input() -> Arc<dyn ExecutionPlan> { |
659 | 2 | let batch1 = test::build_table_i32( |
660 | 2 | ("a", &vec![1; 100]), |
661 | 2 | ("b", &(0..100).rev().collect()), |
662 | 2 | ("c", &(0..100).rev().collect()), |
663 | 2 | ); |
664 | 2 | let batch2 = test::build_table_i32( |
665 | 2 | ("a", &[&vec![1; 25][..], &vec![2; 75][..]].concat()), |
666 | 2 | ("b", &(100..200).rev().collect()), |
667 | 2 | ("c", &(0..100).collect()), |
668 | 2 | ); |
669 | 2 | let batch3 = test::build_table_i32( |
670 | 2 | ("a", &[&vec![3; 50][..], &vec![4; 50][..]].concat()), |
671 | 2 | ("b", &(150..250).rev().collect()), |
672 | 2 | ("c", &(0..100).rev().collect()), |
673 | 2 | ); |
674 | 2 | let batch4 = test::build_table_i32( |
675 | 2 | ("a", &vec![4; 100]), |
676 | 2 | ("b", &(50..150).rev().collect()), |
677 | 2 | ("c", &(0..100).rev().collect()), |
678 | 2 | ); |
679 | 2 | let schema = batch1.schema(); |
680 | 2 | Arc::new( |
681 | 2 | MemoryExec::try_new( |
682 | 2 | &[vec![batch1, batch2, batch3, batch4]], |
683 | 2 | Arc::clone(&schema), |
684 | 2 | None, |
685 | 2 | ) |
686 | 2 | .unwrap(), |
687 | 2 | ) as Arc<dyn ExecutionPlan> |
688 | 2 | } |
689 | | |
690 | | #[tokio::test] |
691 | 1 | async fn test_partitioned_input_partial_sort() -> Result<()> { |
692 | 1 | let task_ctx = Arc::new(TaskContext::default()); |
693 | 1 | let mem_exec = prepare_partitioned_input(); |
694 | 1 | let option_asc = SortOptions { |
695 | 1 | descending: false, |
696 | 1 | nulls_first: false, |
697 | 1 | }; |
698 | 1 | let option_desc = SortOptions { |
699 | 1 | descending: false, |
700 | 1 | nulls_first: false, |
701 | 1 | }; |
702 | 1 | let schema = mem_exec.schema(); |
703 | 1 | let partial_sort_executor = PartialSortExec::new( |
704 | 1 | vec![ |
705 | 1 | PhysicalSortExpr { |
706 | 1 | expr: col("a", &schema)?, |
707 | 1 | options: option_asc, |
708 | 1 | }, |
709 | 1 | PhysicalSortExpr { |
710 | 1 | expr: col("b", &schema)?, |
711 | 1 | options: option_desc, |
712 | 1 | }, |
713 | 1 | PhysicalSortExpr { |
714 | 1 | expr: col("c", &schema)?, |
715 | 1 | options: option_asc, |
716 | 1 | }, |
717 | 1 | ], |
718 | 1 | Arc::clone(&mem_exec), |
719 | 1 | 1, |
720 | 1 | ); |
721 | 1 | let partial_sort_exec = |
722 | 1 | Arc::new(partial_sort_executor.clone()) as Arc<dyn ExecutionPlan>; |
723 | 1 | let sort_exec = Arc::new(SortExec::new( |
724 | 1 | partial_sort_executor.expr, |
725 | 1 | partial_sort_executor.input, |
726 | 1 | )) as Arc<dyn ExecutionPlan>; |
727 | 1 | let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).await?; |
728 | 1 | assert_eq!( |
729 | 5 | result.iter().map(|r| r.num_rows()).collect_vec(), |
730 | 1 | [0, 125, 125, 0, 150] |
731 | 1 | ); |
732 | 1 | |
733 | 1 | assert_eq!( |
734 | 1 | task_ctx.runtime_env().memory_pool.reserved(), |
735 | 1 | 0, |
736 | 1 | "The sort should have returned all memory used back to the memory manager" |
737 | 1 | ); |
738 | 1 | let partial_sort_result = concat_batches(&schema, &result).unwrap(); |
739 | 1 | let sort_result = collect(sort_exec, Arc::clone(&task_ctx)).await?; |
740 | 1 | assert_eq!(sort_result[0], partial_sort_result); |
741 | 1 | |
742 | 1 | Ok(()) |
743 | 1 | } |
744 | | |
745 | | #[tokio::test] |
746 | 1 | async fn test_partitioned_input_partial_sort_with_fetch() -> Result<()> { |
747 | 1 | let task_ctx = Arc::new(TaskContext::default()); |
748 | 1 | let mem_exec = prepare_partitioned_input(); |
749 | 1 | let schema = mem_exec.schema(); |
750 | 1 | let option_asc = SortOptions { |
751 | 1 | descending: false, |
752 | 1 | nulls_first: false, |
753 | 1 | }; |
754 | 1 | let option_desc = SortOptions { |
755 | 1 | descending: false, |
756 | 1 | nulls_first: false, |
757 | 1 | }; |
758 | 4 | for (fetch_size, expected_batch_num_rows) in [ |
759 | 1 | (Some(50), vec![0, 50]), |
760 | 1 | (Some(120), vec![0, 120]), |
761 | 1 | (Some(150), vec![0, 125, 25]), |
762 | 1 | (Some(250), vec![0, 125, 125]), |
763 | 1 | ] { |
764 | 4 | let partial_sort_executor = PartialSortExec::new( |
765 | 4 | vec![ |
766 | 4 | PhysicalSortExpr { |
767 | 4 | expr: col("a", &schema)?, |
768 | 4 | options: option_asc, |
769 | 4 | }, |
770 | 4 | PhysicalSortExpr { |
771 | 4 | expr: col("b", &schema)?, |
772 | 4 | options: option_desc, |
773 | 4 | }, |
774 | 4 | PhysicalSortExpr { |
775 | 4 | expr: col("c", &schema)?, |
776 | 4 | options: option_asc, |
777 | 4 | }, |
778 | 4 | ], |
779 | 4 | Arc::clone(&mem_exec), |
780 | 4 | 1, |
781 | 4 | ) |
782 | 4 | .with_fetch(fetch_size); |
783 | 4 | |
784 | 4 | let partial_sort_exec = |
785 | 4 | Arc::new(partial_sort_executor.clone()) as Arc<dyn ExecutionPlan>; |
786 | 4 | let sort_exec = Arc::new( |
787 | 4 | SortExec::new(partial_sort_executor.expr, partial_sort_executor.input) |
788 | 4 | .with_fetch(fetch_size), |
789 | 4 | ) as Arc<dyn ExecutionPlan>; |
790 | 4 | let result = collect(partial_sort_exec, Arc::clone(&task_ctx)).await?; |
791 | 4 | assert_eq!( |
792 | 10 | result.iter().map(|r| r.num_rows()).collect_vec(), |
793 | 4 | expected_batch_num_rows |
794 | 4 | ); |
795 | 1 | |
796 | 4 | assert_eq!( |
797 | 4 | task_ctx.runtime_env().memory_pool.reserved(), |
798 | 1 | 0, |
799 | 1 | "The sort should have returned all memory used back to the memory manager" |
800 | 1 | ); |
801 | 4 | let partial_sort_result = concat_batches(&schema, &result)?; |
802 | 4 | let sort_result = collect(sort_exec, Arc::clone(&task_ctx)).await?; |
803 | 4 | assert_eq!(sort_result[0], partial_sort_result); |
804 | 1 | } |
805 | 1 | |
806 | 1 | Ok(()) |
807 | 1 | } |
808 | | |
809 | | #[tokio::test] |
810 | 1 | async fn test_sort_metadata() -> Result<()> { |
811 | 1 | let task_ctx = Arc::new(TaskContext::default()); |
812 | 1 | let field_metadata: HashMap<String, String> = |
813 | 1 | vec![("foo".to_string(), "bar".to_string())] |
814 | 1 | .into_iter() |
815 | 1 | .collect(); |
816 | 1 | let schema_metadata: HashMap<String, String> = |
817 | 1 | vec![("baz".to_string(), "barf".to_string())] |
818 | 1 | .into_iter() |
819 | 1 | .collect(); |
820 | 1 | |
821 | 1 | let mut field = Field::new("field_name", DataType::UInt64, true); |
822 | 1 | field.set_metadata(field_metadata.clone()); |
823 | 1 | let schema = Schema::new_with_metadata(vec![field], schema_metadata.clone()); |
824 | 1 | let schema = Arc::new(schema); |
825 | 1 | |
826 | 1 | let data: ArrayRef = |
827 | 1 | Arc::new(vec![1, 1, 2].into_iter().map(Some).collect::<UInt64Array>()); |
828 | 1 | |
829 | 1 | let batch = RecordBatch::try_new(Arc::clone(&schema), vec![data])?; |
830 | 1 | let input = Arc::new(MemoryExec::try_new( |
831 | 1 | &[vec![batch]], |
832 | 1 | Arc::clone(&schema), |
833 | 1 | None, |
834 | 1 | )?); |
835 | 1 | |
836 | 1 | let partial_sort_exec = Arc::new(PartialSortExec::new( |
837 | 1 | vec![PhysicalSortExpr { |
838 | 1 | expr: col("field_name", &schema)?, |
839 | 1 | options: SortOptions::default(), |
840 | 1 | }], |
841 | 1 | input, |
842 | 1 | 1, |
843 | 1 | )); |
844 | 1 | |
845 | 1 | let result: Vec<RecordBatch> = collect(partial_sort_exec, task_ctx).await?; |
846 | 1 | let expected_batch = vec![ |
847 | 1 | RecordBatch::try_new( |
848 | 1 | Arc::clone(&schema), |
849 | 1 | vec![Arc::new( |
850 | 1 | vec![1, 1].into_iter().map(Some).collect::<UInt64Array>(), |
851 | 1 | )], |
852 | 1 | )?, |
853 | 1 | RecordBatch::try_new( |
854 | 1 | Arc::clone(&schema), |
855 | 1 | vec![Arc::new( |
856 | 1 | vec![2].into_iter().map(Some).collect::<UInt64Array>(), |
857 | 1 | )], |
858 | 1 | )?, |
859 | 1 | ]; |
860 | 1 | |
861 | 1 | // Data is correct |
862 | 1 | assert_eq!(&expected_batch, &result); |
863 | 1 | |
864 | 1 | // explicitly ensure the metadata is present |
865 | 1 | assert_eq!(result[0].schema().fields()[0].metadata(), &field_metadata); |
866 | 1 | assert_eq!(result[0].schema().metadata(), &schema_metadata); |
867 | 1 | |
868 | 1 | Ok(()) |
869 | 1 | } |
870 | | |
871 | | #[tokio::test] |
872 | 1 | async fn test_lex_sort_by_float() -> Result<()> { |
873 | 1 | let task_ctx = Arc::new(TaskContext::default()); |
874 | 1 | let schema = Arc::new(Schema::new(vec![ |
875 | 1 | Field::new("a", DataType::Float32, true), |
876 | 1 | Field::new("b", DataType::Float64, true), |
877 | 1 | Field::new("c", DataType::Float64, true), |
878 | 1 | ])); |
879 | 1 | let option_asc = SortOptions { |
880 | 1 | descending: false, |
881 | 1 | nulls_first: true, |
882 | 1 | }; |
883 | 1 | let option_desc = SortOptions { |
884 | 1 | descending: true, |
885 | 1 | nulls_first: true, |
886 | 1 | }; |
887 | 1 | |
888 | 1 | // define data. |
889 | 1 | let batch = RecordBatch::try_new( |
890 | 1 | Arc::clone(&schema), |
891 | 1 | vec![ |
892 | 1 | Arc::new(Float32Array::from(vec![ |
893 | 1 | Some(1.0_f32), |
894 | 1 | Some(1.0_f32), |
895 | 1 | Some(1.0_f32), |
896 | 1 | Some(2.0_f32), |
897 | 1 | Some(2.0_f32), |
898 | 1 | Some(3.0_f32), |
899 | 1 | Some(3.0_f32), |
900 | 1 | Some(3.0_f32), |
901 | 1 | ])), |
902 | 1 | Arc::new(Float64Array::from(vec![ |
903 | 1 | Some(20.0_f64), |
904 | 1 | Some(20.0_f64), |
905 | 1 | Some(40.0_f64), |
906 | 1 | Some(40.0_f64), |
907 | 1 | Some(f64::NAN), |
908 | 1 | None, |
909 | 1 | None, |
910 | 1 | Some(f64::NAN), |
911 | 1 | ])), |
912 | 1 | Arc::new(Float64Array::from(vec![ |
913 | 1 | Some(10.0_f64), |
914 | 1 | Some(20.0_f64), |
915 | 1 | Some(10.0_f64), |
916 | 1 | Some(100.0_f64), |
917 | 1 | Some(f64::NAN), |
918 | 1 | Some(100.0_f64), |
919 | 1 | None, |
920 | 1 | Some(f64::NAN), |
921 | 1 | ])), |
922 | 1 | ], |
923 | 1 | )?; |
924 | 1 | |
925 | 1 | let partial_sort_exec = Arc::new(PartialSortExec::new( |
926 | 1 | vec![ |
927 | 1 | PhysicalSortExpr { |
928 | 1 | expr: col("a", &schema)?, |
929 | 1 | options: option_asc, |
930 | 1 | }, |
931 | 1 | PhysicalSortExpr { |
932 | 1 | expr: col("b", &schema)?, |
933 | 1 | options: option_asc, |
934 | 1 | }, |
935 | 1 | PhysicalSortExpr { |
936 | 1 | expr: col("c", &schema)?, |
937 | 1 | options: option_desc, |
938 | 1 | }, |
939 | 1 | ], |
940 | 1 | Arc::new(MemoryExec::try_new(&[vec![batch]], schema, None)?), |
941 | 1 | 2, |
942 | 1 | )); |
943 | 1 | |
944 | 1 | let expected = [ |
945 | 1 | "+-----+------+-------+", |
946 | 1 | "| a | b | c |", |
947 | 1 | "+-----+------+-------+", |
948 | 1 | "| 1.0 | 20.0 | 20.0 |", |
949 | 1 | "| 1.0 | 20.0 | 10.0 |", |
950 | 1 | "| 1.0 | 40.0 | 10.0 |", |
951 | 1 | "| 2.0 | 40.0 | 100.0 |", |
952 | 1 | "| 2.0 | NaN | NaN |", |
953 | 1 | "| 3.0 | | |", |
954 | 1 | "| 3.0 | | 100.0 |", |
955 | 1 | "| 3.0 | NaN | NaN |", |
956 | 1 | "+-----+------+-------+", |
957 | 1 | ]; |
958 | 1 | |
959 | 1 | assert_eq!( |
960 | 1 | DataType::Float32, |
961 | 1 | *partial_sort_exec.schema().field(0).data_type() |
962 | 1 | ); |
963 | 1 | assert_eq!( |
964 | 1 | DataType::Float64, |
965 | 1 | *partial_sort_exec.schema().field(1).data_type() |
966 | 1 | ); |
967 | 1 | assert_eq!( |
968 | 1 | DataType::Float64, |
969 | 1 | *partial_sort_exec.schema().field(2).data_type() |
970 | 1 | ); |
971 | 1 | |
972 | 1 | let result: Vec<RecordBatch> = collect( |
973 | 1 | Arc::clone(&partial_sort_exec) as Arc<dyn ExecutionPlan>, |
974 | 1 | task_ctx, |
975 | 1 | ) |
976 | 1 | .await?; |
977 | 1 | assert_batches_eq!(expected, &result); |
978 | 1 | assert_eq!(result.len(), 2); |
979 | 1 | let metrics = partial_sort_exec.metrics().unwrap(); |
980 | 1 | assert!(metrics.elapsed_compute().unwrap() > 0); |
981 | 1 | assert_eq!(metrics.output_rows().unwrap(), 8); |
982 | 1 | |
983 | 1 | let columns = result[0].columns(); |
984 | 1 | |
985 | 1 | assert_eq!(DataType::Float32, *columns[0].data_type()); |
986 | 1 | assert_eq!(DataType::Float64, *columns[1].data_type()); |
987 | 1 | assert_eq!(DataType::Float64, *columns[2].data_type()); |
988 | 1 | |
989 | 1 | Ok(()) |
990 | 1 | } |
991 | | |
992 | | #[tokio::test] |
993 | 1 | async fn test_drop_cancel() -> Result<()> { |
994 | 1 | let task_ctx = Arc::new(TaskContext::default()); |
995 | 1 | let schema = Arc::new(Schema::new(vec![ |
996 | 1 | Field::new("a", DataType::Float32, true), |
997 | 1 | Field::new("b", DataType::Float32, true), |
998 | 1 | ])); |
999 | 1 | |
1000 | 1 | let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 1)); |
1001 | 1 | let refs = blocking_exec.refs(); |
1002 | 1 | let sort_exec = Arc::new(PartialSortExec::new( |
1003 | 1 | vec![PhysicalSortExpr { |
1004 | 1 | expr: col("a", &schema)?, |
1005 | 1 | options: SortOptions::default(), |
1006 | 1 | }], |
1007 | 1 | blocking_exec, |
1008 | 1 | 1, |
1009 | 1 | )); |
1010 | 1 | |
1011 | 1 | let fut = collect(sort_exec, Arc::clone(&task_ctx)); |
1012 | 1 | let mut fut = fut.boxed(); |
1013 | 1 | |
1014 | 1 | assert_is_pending(&mut fut); |
1015 | 1 | drop(fut); |
1016 | 1 | assert_strong_count_converges_to_zero(refs).await; |
1017 | 1 | |
1018 | 1 | assert_eq!( |
1019 | 1 | task_ctx.runtime_env().memory_pool.reserved(), |
1020 | 1 | 0, |
1021 | 1 | "The sort should have returned all memory used back to the memory manager" |
1022 | 1 | ); |
1023 | 1 | |
1024 | 1 | Ok(()) |
1025 | 1 | } |
1026 | | } |