Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/functions-aggregate-common/src/merge_arrays.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use arrow::compute::SortOptions;
19
use datafusion_common::utils::compare_rows;
20
use datafusion_common::{exec_err, ScalarValue};
21
use std::cmp::Ordering;
22
use std::collections::{BinaryHeap, VecDeque};
23
24
/// This is a wrapper struct to be able to correctly merge `ARRAY_AGG` data from
25
/// multiple partitions using `BinaryHeap`. When used inside `BinaryHeap`, this
26
/// struct returns smallest `CustomElement`, where smallest is determined by
27
/// `ordering` values (`Vec<ScalarValue>`) according to `sort_options`.
28
#[derive(Debug, PartialEq, Eq)]
29
struct CustomElement<'a> {
30
    /// Stores the partition this entry came from
31
    branch_idx: usize,
32
    /// Values to merge
33
    value: ScalarValue,
34
    // Comparison "key"
35
    ordering: Vec<ScalarValue>,
36
    /// Options defining the ordering semantics
37
    sort_options: &'a [SortOptions],
38
}
39
40
impl<'a> CustomElement<'a> {
41
0
    fn new(
42
0
        branch_idx: usize,
43
0
        value: ScalarValue,
44
0
        ordering: Vec<ScalarValue>,
45
0
        sort_options: &'a [SortOptions],
46
0
    ) -> Self {
47
0
        Self {
48
0
            branch_idx,
49
0
            value,
50
0
            ordering,
51
0
            sort_options,
52
0
        }
53
0
    }
54
55
0
    fn ordering(
56
0
        &self,
57
0
        current: &[ScalarValue],
58
0
        target: &[ScalarValue],
59
0
    ) -> datafusion_common::Result<Ordering> {
60
0
        // Calculate ordering according to `sort_options`
61
0
        compare_rows(current, target, self.sort_options)
62
0
    }
63
}
64
65
// Overwrite ordering implementation such that
66
// - `self.ordering` values are used for comparison,
67
// - When used inside `BinaryHeap` it is a min-heap.
68
impl<'a> Ord for CustomElement<'a> {
69
0
    fn cmp(&self, other: &Self) -> Ordering {
70
0
        // Compares according to custom ordering
71
0
        self.ordering(&self.ordering, &other.ordering)
72
0
            // Convert max heap to min heap
73
0
            .map(|ordering| ordering.reverse())
74
0
            // This function return error, when `self.ordering` and `other.ordering`
75
0
            // have different types (such as one is `ScalarValue::Int64`, other is `ScalarValue::Float32`)
76
0
            // Here this case won't happen, because data from each partition will have same type
77
0
            .unwrap()
78
0
    }
79
}
80
81
impl<'a> PartialOrd for CustomElement<'a> {
82
0
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
83
0
        Some(self.cmp(other))
84
0
    }
85
}
86
87
/// This functions merges `values` array (`&[Vec<ScalarValue>]`) into single array `Vec<ScalarValue>`
88
/// Merging done according to ordering values stored inside `ordering_values` (`&[Vec<Vec<ScalarValue>>]`)
89
/// Inner `Vec<ScalarValue>` in the `ordering_values` can be thought as ordering information for the
90
/// each `ScalarValue` in the `values` array.
91
/// Desired ordering specified by `sort_options` argument (Should have same size with inner `Vec<ScalarValue>`
92
/// of the `ordering_values` array).
93
///
94
/// As an example
95
/// values can be \[
96
///      \[1, 2, 3, 4, 5\],
97
///      \[1, 2, 3, 4\],
98
///      \[1, 2, 3, 4, 5, 6\],
99
/// \]
100
/// In this case we will be merging three arrays (doesn't have to be same size)
101
/// and produce a merged array with size 15 (sum of 5+4+6)
102
/// Merging will be done according to ordering at `ordering_values` vector.
103
/// As an example `ordering_values` can be [
104
///      \[(1, a), (2, b), (3, b), (4, a), (5, b) \],
105
///      \[(1, a), (2, b), (3, b), (4, a) \],
106
///      \[(1, b), (2, c), (3, d), (4, e), (5, a), (6, b) \],
107
/// ]
108
/// For each ScalarValue in the `values` we have a corresponding `Vec<ScalarValue>` (like timestamp of it)
109
/// for the example above `sort_options` will have size two, that defines ordering requirement of the merge.
110
/// Inner `Vec<ScalarValue>`s of the `ordering_values` will be compared according `sort_options` (Their sizes should match)
111
0
pub fn merge_ordered_arrays(
112
0
    // We will merge values into single `Vec<ScalarValue>`.
113
0
    values: &mut [VecDeque<ScalarValue>],
114
0
    // `values` will be merged according to `ordering_values`.
115
0
    // Inner `Vec<ScalarValue>` can be thought as ordering information for the
116
0
    // each `ScalarValue` in the values`.
117
0
    ordering_values: &mut [VecDeque<Vec<ScalarValue>>],
118
0
    // Defines according to which ordering comparisons should be done.
119
0
    sort_options: &[SortOptions],
120
0
) -> datafusion_common::Result<(Vec<ScalarValue>, Vec<Vec<ScalarValue>>)> {
121
0
    // Keep track the most recent data of each branch, in binary heap data structure.
122
0
    let mut heap = BinaryHeap::<CustomElement>::new();
123
0
124
0
    if values.len() != ordering_values.len()
125
0
        || values
126
0
            .iter()
127
0
            .zip(ordering_values.iter())
128
0
            .any(|(vals, ordering_vals)| vals.len() != ordering_vals.len())
129
    {
130
0
        return exec_err!(
131
0
            "Expects values arguments and/or ordering_values arguments to have same size"
132
0
        );
133
0
    }
134
0
    let n_branch = values.len();
135
0
    let mut merged_values = vec![];
136
0
    let mut merged_orderings = vec![];
137
    // Continue iterating the loop until consuming data of all branches.
138
    loop {
139
0
        let minimum = if let Some(minimum) = heap.pop() {
140
0
            minimum
141
        } else {
142
            // Heap is empty, fill it with the next entries from each branch.
143
0
            for branch_idx in 0..n_branch {
144
0
                if let Some(orderings) = ordering_values[branch_idx].pop_front() {
145
0
                    // Their size should be same, we can safely .unwrap here.
146
0
                    let value = values[branch_idx].pop_front().unwrap();
147
0
                    // Push the next element to the heap:
148
0
                    heap.push(CustomElement::new(
149
0
                        branch_idx,
150
0
                        value,
151
0
                        orderings,
152
0
                        sort_options,
153
0
                    ));
154
0
                }
155
                // If None, we consumed this branch, skip it.
156
            }
157
158
            // Now we have filled the heap, get the largest entry (this will be
159
            // the next element in merge).
160
0
            if let Some(minimum) = heap.pop() {
161
0
                minimum
162
            } else {
163
                // Heap is empty, this means that all indices are same with
164
                // `end_indices`. We have consumed all of the branches, merge
165
                // is completed, exit from the loop:
166
0
                break;
167
            }
168
        };
169
        let CustomElement {
170
0
            branch_idx,
171
0
            value,
172
0
            ordering,
173
0
            ..
174
0
        } = minimum;
175
0
        // Add minimum value in the heap to the result
176
0
        merged_values.push(value);
177
0
        merged_orderings.push(ordering);
178
179
        // If there is an available entry, push next entry in the most
180
        // recently consumed branch to the heap.
181
0
        if let Some(orderings) = ordering_values[branch_idx].pop_front() {
182
0
            // Their size should be same, we can safely .unwrap here.
183
0
            let value = values[branch_idx].pop_front().unwrap();
184
0
            // Push the next element to the heap:
185
0
            heap.push(CustomElement::new(
186
0
                branch_idx,
187
0
                value,
188
0
                orderings,
189
0
                sort_options,
190
0
            ));
191
0
        }
192
    }
193
194
0
    Ok((merged_values, merged_orderings))
195
0
}