Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/expr-common/src/groups_accumulator.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Vectorized [`GroupsAccumulator`]
19
20
use arrow::array::{ArrayRef, BooleanArray};
21
use datafusion_common::{not_impl_err, Result};
22
23
/// Describes how many rows should be emitted during grouping.
#[derive(Debug, Clone, Copy)]
pub enum EmitTo {
    /// Emit all groups
    All,
    /// Emit only the first `n` groups and shift all existing group
    /// indexes down by `n`.
    ///
    /// For example, if `n=10`, group_index `0, 1, ... 9` are emitted
    /// and group indexes `10, 11, 12, ...` become `0, 1, 2, ...`.
    First(usize),
}

impl EmitTo {
    /// Removes from `v` the elements that should be emitted and returns
    /// them as a new `Vec`, leaving the elements that were not emitted
    /// in `v` (in their original order).
    ///
    /// * [`Self::All`]: returns every element of `v` and leaves `v`
    ///   empty. The contents are moved, not copied.
    ///
    /// * [`Self::First`]: returns the first `n` elements of `v` and
    ///   leaves elements `n..` in `v`, so that the element previously at
    ///   index `n` is now at index `0`. If `n` is larger than `v.len()`,
    ///   all elements are returned (and `v` is left empty) rather than
    ///   panicking.
    pub fn take_needed<T>(&self, v: &mut Vec<T>) -> Vec<T> {
        match self {
            // Take the entire vector, leaving a new (empty) vector behind
            Self::All => std::mem::take(v),
            Self::First(n) => {
                // `Vec::split_off` panics if the index is past the end,
                // so never ask for more elements than `v` holds
                let n = (*n).min(v.len());
                // `remaining` holds elements `n..`; `v` is truncated to `0..n`
                let remaining = v.split_off(n);
                // Put the remaining elements back into `v` and hand the
                // first `n` elements to the caller
                std::mem::replace(v, remaining)
            }
        }
    }
}
58
59
/// `GroupsAccumulator` implements a single aggregate (e.g. AVG) and
60
/// stores the state for *all* groups internally.
61
///
62
/// Logically, a [`GroupsAccumulator`] stores a mapping from each group index to
63
/// the state of the aggregate for that group. For example an implementation for
64
/// `min` might look like
65
///
66
/// ```text
67
///    ┌─────┐
68
///    │  0  │───────────▶   100
69
///    ├─────┤
70
///    │  1  │───────────▶   200
71
///    └─────┘
72
///      ...                 ...
73
///    ┌─────┐
74
///    │ N-2 │───────────▶    50
75
///    ├─────┤
76
///    │ N-1 │───────────▶   200
77
///    └─────┘
78
///
79
///
80
///  Logical group      Current Min
81
///     number          value for that
82
///                     group
83
/// ```
84
///
85
/// # Notes on Implementing `GroupsAccumulator`
86
///
87
/// All aggregates must first implement the simpler [`Accumulator`] trait, which
88
/// handles state for a single group. Implementing `GroupsAccumulator` is
89
/// optional and is harder to implement than `Accumulator`, but can be much
90
/// faster for queries with many group values.  See the [Aggregating Millions of
91
/// Groups Fast blog] for more background.
92
///
93
/// # Details
94
/// Each group is assigned a `group_index` by the hash table and each
95
/// accumulator manages the specific state, one per `group_index`.
96
///
97
/// `group_index`es are contiguous (there aren't gaps), and thus it is
98
/// expected that each `GroupsAccumulator` will use something like `Vec<..>`
99
/// to store the group states.
100
///
101
/// [`Accumulator`]: crate::accumulator::Accumulator
102
/// [Aggregating Millions of Groups Fast blog]: https://arrow.apache.org/blog/2023/08/05/datafusion_fast_grouping/
103
pub trait GroupsAccumulator: Send {
104
    /// Updates the accumulator's state from its arguments, encoded as
105
    /// a vector of [`ArrayRef`]s.
106
    ///
107
    /// * `values`: the input arguments to the accumulator
108
    ///
109
    /// * `group_indices`: the groups to which the rows in `values`
110
    ///   belong (as group indexes)
111
    ///
112
    /// * `opt_filter`: if present, only update aggregate state using
113
    ///   `values[i]` if `opt_filter[i]` is true
114
    ///
115
    /// * `total_num_groups`: the number of groups (the largest
116
    ///   group_index is thus `total_num_groups - 1`).
117
    ///
118
    /// Note that subsequent calls to update_batch may have larger
119
    /// total_num_groups as new groups are seen.
120
    fn update_batch(
121
        &mut self,
122
        values: &[ArrayRef],
123
        group_indices: &[usize],
124
        opt_filter: Option<&BooleanArray>,
125
        total_num_groups: usize,
126
    ) -> Result<()>;
127
128
    /// Returns the final aggregate value for each group as a single
129
    /// array, resetting the internal state.
130
    ///
131
    /// The rows returned *must* be in group_index order: The value
132
    /// for group_index 0, followed by 1, etc.  Any group_index that
133
    /// did not have values, should be null.
134
    ///
135
    /// For example, a `SUM` accumulator maintains a running sum for
136
    /// each group, and `evaluate` will produce that running sum as
137
    /// its output for all groups, in group_index order.
138
    ///
139
    /// If `emit_to` is [`EmitTo::All`], the accumulator should
140
    /// return all groups and release / reset its internal state
141
    /// equivalent to when it was first created.
142
    ///
143
    /// If `emit_to` is [`EmitTo::First`], only the first `n` groups
144
    /// should be emitted and the state for those first groups
145
    /// removed. State for the remaining groups must be retained for
146
    /// future use. The group_indices on subsequent calls to
147
    /// `update_batch` or `merge_batch` will be shifted down by
148
    /// `n`. See [`EmitTo::First`] for more details.
149
    fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef>;
150
151
    /// Returns the intermediate aggregate state for this accumulator,
152
    /// used for multi-phase grouping, resetting its internal state.
153
    ///
154
    /// See [`Accumulator::state`] for more information on multi-phase
155
    /// aggregation.
156
    ///
157
    /// For example, `AVG` might return two arrays: `SUM` and `COUNT`
158
    /// but the `MIN` aggregate would just return a single array.
159
    ///
160
    /// Note more sophisticated internal state can be passed as
161
    /// single `StructArray` rather than multiple arrays.
162
    ///
163
    /// See [`Self::evaluate`] for details on the required output
164
    /// order and `emit_to`.
165
    ///
166
    /// [`Accumulator::state`]: crate::accumulator::Accumulator::state
167
    fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>>;
168
169
    /// Merges intermediate state (the output from [`Self::state`])
170
    /// into this accumulator's current state.
171
    ///
172
    /// For some aggregates (such as `SUM`), `merge_batch` is the same
173
    /// as `update_batch`, but for some aggregates (such as `COUNT`,
174
    /// where the partial counts must be summed) the operations
175
    /// differ. See [`Self::state`] for more details on how state is
176
    /// used and merged.
177
    ///
178
    /// * `values`: arrays produced by a previous call to `state` on an accumulator
179
    ///
180
    /// Other arguments are the same as for [`Self::update_batch`].
181
    fn merge_batch(
182
        &mut self,
183
        values: &[ArrayRef],
184
        group_indices: &[usize],
185
        opt_filter: Option<&BooleanArray>,
186
        total_num_groups: usize,
187
    ) -> Result<()>;
188
189
    /// Converts an input batch directly to the intermediate aggregate state.
190
    ///
191
    /// This is the equivalent of treating each input row as its own group. It
192
    /// is invoked when the Partial phase of a multi-phase aggregation is not
193
    /// reducing the cardinality enough to warrant spending more effort on
194
    /// pre-aggregation (see `Background` section below), and switches to
195
    /// passing intermediate state directly on to the next aggregation phase.
196
    ///
197
    /// Examples:
198
    /// * `COUNT`: an array of 1s for each row in the input batch.
199
    /// * `SUM/MIN/MAX`: the input values themselves.
200
    ///
201
    /// # Arguments
202
    /// * `values`: the input arguments to the accumulator
203
    /// * `opt_filter`: if present, any row where `opt_filter[i]` is false should be ignored
204
    ///
205
    /// # Background
206
    ///
207
    /// In a multi-phase aggregation (see [`Accumulator::state`]), the initial
208
    /// Partial phase reduces the cardinality of the input data as soon as
209
    /// possible in the plan.
210
    ///
211
    /// This strategy is very effective for queries with a small number of
212
    /// groups, as most of the data is aggregated immediately and only a small
213
    /// amount of data must be repartitioned (see [`Accumulator::state`] for
214
    /// background).
215
    ///
216
    /// However, for queries with a large number of groups, the Partial phase
217
    /// often does not reduce the cardinality enough to warrant the memory and
218
    /// CPU cost of actually performing the aggregation. For such cases, the
219
    /// HashAggregate operator will dynamically switch to passing intermediate
220
    /// state directly to the next aggregation phase with minimal processing
221
    /// using this method.
222
    ///
223
    /// [`Accumulator::state`]: crate::accumulator::Accumulator::state
224
0
    fn convert_to_state(
225
0
        &self,
226
0
        _values: &[ArrayRef],
227
0
        _opt_filter: Option<&BooleanArray>,
228
0
    ) -> Result<Vec<ArrayRef>> {
229
0
        not_impl_err!("Input batch conversion to state not implemented")
230
0
    }
231
232
    /// Returns `true` if [`Self::convert_to_state`] is implemented to support
233
    /// intermediate aggregate state conversion.
234
0
    fn supports_convert_to_state(&self) -> bool {
235
0
        false
236
0
    }
237
238
    /// Amount of memory used to store the state of this accumulator,
239
    /// in bytes.
240
    ///
241
    /// This function is called once per batch, so it should be `O(n)` to
242
    /// compute, not `O(num_groups)`
243
    fn size(&self) -> usize;
244
}