/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/sorts/streaming_merge.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Merge that deals with an arbitrary size of streaming inputs. |
19 | | //! This is an order-preserving merge. |
20 | | |
21 | | use crate::metrics::BaselineMetrics; |
22 | | use crate::sorts::{ |
23 | | merge::SortPreservingMergeStream, |
24 | | stream::{FieldCursorStream, RowCursorStream}, |
25 | | }; |
26 | | use crate::{PhysicalSortExpr, SendableRecordBatchStream}; |
27 | | use arrow::datatypes::{DataType, SchemaRef}; |
28 | | use arrow_array::*; |
29 | | use datafusion_common::{internal_err, Result}; |
30 | | use datafusion_execution::memory_pool::MemoryReservation; |
31 | | |
32 | | macro_rules! primitive_merge_helper { |
33 | | ($t:ty, $($v:ident),+) => { |
34 | | merge_helper!(PrimitiveArray<$t>, $($v),+) |
35 | | }; |
36 | | } |
37 | | |
38 | | macro_rules! merge_helper { |
39 | | ($t:ty, $sort:ident, $streams:ident, $schema:ident, $tracking_metrics:ident, $batch_size:ident, $fetch:ident, $reservation:ident) => {{ |
40 | | let streams = FieldCursorStream::<$t>::new($sort, $streams); |
41 | | return Ok(Box::pin(SortPreservingMergeStream::new( |
42 | | Box::new(streams), |
43 | | $schema, |
44 | | $tracking_metrics, |
45 | | $batch_size, |
46 | | $fetch, |
47 | | $reservation, |
48 | | ))); |
49 | | }}; |
50 | | } |
51 | | |
52 | | #[derive(Default)] |
53 | | pub struct StreamingMergeBuilder<'a> { |
54 | | streams: Vec<SendableRecordBatchStream>, |
55 | | schema: Option<SchemaRef>, |
56 | | expressions: &'a [PhysicalSortExpr], |
57 | | metrics: Option<BaselineMetrics>, |
58 | | batch_size: Option<usize>, |
59 | | fetch: Option<usize>, |
60 | | reservation: Option<MemoryReservation>, |
61 | | } |
62 | | |
63 | | impl<'a> StreamingMergeBuilder<'a> { |
64 | 20 | pub fn new() -> Self { |
65 | 20 | Self::default() |
66 | 20 | } |
67 | | |
68 | 20 | pub fn with_streams(mut self, streams: Vec<SendableRecordBatchStream>) -> Self { |
69 | 20 | self.streams = streams; |
70 | 20 | self |
71 | 20 | } |
72 | | |
73 | 20 | pub fn with_schema(mut self, schema: SchemaRef) -> Self { |
74 | 20 | self.schema = Some(schema); |
75 | 20 | self |
76 | 20 | } |
77 | | |
78 | 20 | pub fn with_expressions(mut self, expressions: &'a [PhysicalSortExpr]) -> Self { |
79 | 20 | self.expressions = expressions; |
80 | 20 | self |
81 | 20 | } |
82 | | |
83 | 20 | pub fn with_metrics(mut self, metrics: BaselineMetrics) -> Self { |
84 | 20 | self.metrics = Some(metrics); |
85 | 20 | self |
86 | 20 | } |
87 | | |
88 | 20 | pub fn with_batch_size(mut self, batch_size: usize) -> Self { |
89 | 20 | self.batch_size = Some(batch_size); |
90 | 20 | self |
91 | 20 | } |
92 | | |
93 | 16 | pub fn with_fetch(mut self, fetch: Option<usize>) -> Self { |
94 | 16 | self.fetch = fetch; |
95 | 16 | self |
96 | 16 | } |
97 | | |
98 | 20 | pub fn with_reservation(mut self, reservation: MemoryReservation) -> Self { |
99 | 20 | self.reservation = Some(reservation); |
100 | 20 | self |
101 | 20 | } |
102 | | |
103 | 20 | pub fn build(self) -> Result<SendableRecordBatchStream> { |
104 | 20 | let Self { |
105 | 20 | streams, |
106 | 20 | schema, |
107 | 20 | metrics, |
108 | 20 | batch_size, |
109 | 20 | reservation, |
110 | 20 | fetch, |
111 | 20 | expressions, |
112 | 20 | } = self; |
113 | 20 | |
114 | 20 | // Early return if streams or expressions are empty |
115 | 20 | let checks = [ |
116 | 20 | ( |
117 | 20 | streams.is_empty(), |
118 | 20 | "Streams cannot be empty for streaming merge", |
119 | 20 | ), |
120 | 20 | ( |
121 | 20 | expressions.is_empty(), |
122 | 20 | "Sort expressions cannot be empty for streaming merge", |
123 | 20 | ), |
124 | 20 | ]; |
125 | | |
126 | 40 | if let Some((_, error_message)) = checks.iter().find(|(condition, _)| *condition) |
127 | | { |
128 | 1 | return internal_err!("{}", error_message); |
129 | 19 | } |
130 | 19 | |
131 | 19 | // Unwrapping mandatory fields |
132 | 19 | let schema = schema.expect("Schema cannot be empty for streaming merge"); |
133 | 19 | let metrics = metrics.expect("Metrics cannot be empty for streaming merge"); |
134 | 19 | let batch_size = |
135 | 19 | batch_size.expect("Batch size cannot be empty for streaming merge"); |
136 | 19 | let reservation = |
137 | 19 | reservation.expect("Reservation cannot be empty for streaming merge"); |
138 | 19 | |
139 | 19 | // Special case single column comparisons with optimized cursor implementations |
140 | 19 | if expressions.len() == 1 { |
141 | 14 | let sort = expressions[0].clone(); |
142 | 14 | let data_type = sort.expr.data_type(schema.as_ref())?; |
143 | 0 | downcast_primitive! { |
144 | 0 | data_type => (primitive_merge_helper, sort, streams, schema, metrics, batch_size, fetch, reservation), |
145 | 2 | DataType::Utf8 => merge_helper!(StringArray, sort, streams, schema, metrics, batch_size, fetch, reservation) |
146 | 0 | DataType::LargeUtf8 => merge_helper!(LargeStringArray, sort, streams, schema, metrics, batch_size, fetch, reservation) |
147 | 0 | DataType::Binary => merge_helper!(BinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation) |
148 | 0 | DataType::LargeBinary => merge_helper!(LargeBinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation) |
149 | 0 | _ => {} |
150 | | } |
151 | 5 | } |
152 | | |
153 | 5 | let streams = RowCursorStream::try_new( |
154 | 5 | schema.as_ref(), |
155 | 5 | expressions, |
156 | 5 | streams, |
157 | 5 | reservation.new_empty(), |
158 | 5 | )?; |
159 | 5 | Ok(Box::pin(SortPreservingMergeStream::new( |
160 | 5 | Box::new(streams), |
161 | 5 | schema, |
162 | 5 | metrics, |
163 | 5 | batch_size, |
164 | 5 | fetch, |
165 | 5 | reservation, |
166 | 5 | ))) |
167 | 20 | } |
168 | | } |