Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/sorts/streaming_merge.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Merge that deals with an arbitrary size of streaming inputs.
19
//! This is an order-preserving merge.
20
21
use crate::metrics::BaselineMetrics;
22
use crate::sorts::{
23
    merge::SortPreservingMergeStream,
24
    stream::{FieldCursorStream, RowCursorStream},
25
};
26
use crate::{PhysicalSortExpr, SendableRecordBatchStream};
27
use arrow::datatypes::{DataType, SchemaRef};
28
use arrow_array::*;
29
use datafusion_common::{internal_err, Result};
30
use datafusion_execution::memory_pool::MemoryReservation;
31
32
macro_rules! primitive_merge_helper {
33
    ($t:ty, $($v:ident),+) => {
34
        merge_helper!(PrimitiveArray<$t>, $($v),+)
35
    };
36
}
37
38
macro_rules! merge_helper {
39
    ($t:ty, $sort:ident, $streams:ident, $schema:ident, $tracking_metrics:ident, $batch_size:ident, $fetch:ident, $reservation:ident) => {{
40
        let streams = FieldCursorStream::<$t>::new($sort, $streams);
41
        return Ok(Box::pin(SortPreservingMergeStream::new(
42
            Box::new(streams),
43
            $schema,
44
            $tracking_metrics,
45
            $batch_size,
46
            $fetch,
47
            $reservation,
48
        )));
49
    }};
50
}
51
52
#[derive(Default)]
53
pub struct StreamingMergeBuilder<'a> {
54
    streams: Vec<SendableRecordBatchStream>,
55
    schema: Option<SchemaRef>,
56
    expressions: &'a [PhysicalSortExpr],
57
    metrics: Option<BaselineMetrics>,
58
    batch_size: Option<usize>,
59
    fetch: Option<usize>,
60
    reservation: Option<MemoryReservation>,
61
}
62
63
impl<'a> StreamingMergeBuilder<'a> {
64
20
    pub fn new() -> Self {
65
20
        Self::default()
66
20
    }
67
68
20
    pub fn with_streams(mut self, streams: Vec<SendableRecordBatchStream>) -> Self {
69
20
        self.streams = streams;
70
20
        self
71
20
    }
72
73
20
    pub fn with_schema(mut self, schema: SchemaRef) -> Self {
74
20
        self.schema = Some(schema);
75
20
        self
76
20
    }
77
78
20
    pub fn with_expressions(mut self, expressions: &'a [PhysicalSortExpr]) -> Self {
79
20
        self.expressions = expressions;
80
20
        self
81
20
    }
82
83
20
    pub fn with_metrics(mut self, metrics: BaselineMetrics) -> Self {
84
20
        self.metrics = Some(metrics);
85
20
        self
86
20
    }
87
88
20
    pub fn with_batch_size(mut self, batch_size: usize) -> Self {
89
20
        self.batch_size = Some(batch_size);
90
20
        self
91
20
    }
92
93
16
    pub fn with_fetch(mut self, fetch: Option<usize>) -> Self {
94
16
        self.fetch = fetch;
95
16
        self
96
16
    }
97
98
20
    pub fn with_reservation(mut self, reservation: MemoryReservation) -> Self {
99
20
        self.reservation = Some(reservation);
100
20
        self
101
20
    }
102
103
20
    pub fn build(self) -> Result<SendableRecordBatchStream> {
104
20
        let Self {
105
20
            streams,
106
20
            schema,
107
20
            metrics,
108
20
            batch_size,
109
20
            reservation,
110
20
            fetch,
111
20
            expressions,
112
20
        } = self;
113
20
114
20
        // Early return if streams or expressions are empty
115
20
        let checks = [
116
20
            (
117
20
                streams.is_empty(),
118
20
                "Streams cannot be empty for streaming merge",
119
20
            ),
120
20
            (
121
20
                expressions.is_empty(),
122
20
                "Sort expressions cannot be empty for streaming merge",
123
20
            ),
124
20
        ];
125
126
40
        if let Some((_, 
error_message1
)) =
checks.iter().find(20
|(condition, _)| *condition
)20
127
        {
128
1
            return internal_err!("{}", error_message);
129
19
        }
130
19
131
19
        // Unwrapping mandatory fields
132
19
        let schema = schema.expect("Schema cannot be empty for streaming merge");
133
19
        let metrics = metrics.expect("Metrics cannot be empty for streaming merge");
134
19
        let batch_size =
135
19
            batch_size.expect("Batch size cannot be empty for streaming merge");
136
19
        let reservation =
137
19
            reservation.expect("Reservation cannot be empty for streaming merge");
138
19
139
19
        // Special case single column comparisons with optimized cursor implementations
140
19
        if expressions.len() == 1 {
141
14
            let sort = expressions[0].clone();
142
14
            let data_type = sort.expr.data_type(schema.as_ref())
?0
;
143
0
            downcast_primitive! {
144
0
                data_type => (primitive_merge_helper, sort, streams, schema, metrics, batch_size, fetch, reservation),
145
2
                DataType::Utf8 => merge_helper!(StringArray, sort, streams, schema, metrics, batch_size, fetch, reservation)
146
0
                DataType::LargeUtf8 => merge_helper!(LargeStringArray, sort, streams, schema, metrics, batch_size, fetch, reservation)
147
0
                DataType::Binary => merge_helper!(BinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation)
148
0
                DataType::LargeBinary => merge_helper!(LargeBinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation)
149
0
                _ => {}
150
            }
151
5
        }
152
153
5
        let streams = RowCursorStream::try_new(
154
5
            schema.as_ref(),
155
5
            expressions,
156
5
            streams,
157
5
            reservation.new_empty(),
158
5
        )
?0
;
159
5
        Ok(Box::pin(SortPreservingMergeStream::new(
160
5
            Box::new(streams),
161
5
            schema,
162
5
            metrics,
163
5
            batch_size,
164
5
            fetch,
165
5
            reservation,
166
5
        )))
167
20
    }
168
}