Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/expr-common/src/accumulator.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! Accumulator module contains the trait definition for aggregation function's accumulators.
19
20
use arrow::array::ArrayRef;
21
use datafusion_common::{internal_err, Result, ScalarValue};
22
use std::fmt::Debug;
23
24
/// Tracks an aggregate function's state.
25
///
26
/// `Accumulator`s are stateful objects that implement a single group. They
27
/// aggregate values from multiple rows together into a final output aggregate.
28
///
29
/// [`GroupsAccumulator`] is an additional, more performant (but also more complex) API
30
/// that manages state for multiple groups at once.
31
///
32
/// An accumulator knows how to:
33
/// * update its state from inputs via [`update_batch`]
34
///
35
/// * compute the final value from its internal state via [`evaluate`]
36
///
37
/// * retract an update to its state from given inputs via
38
///   [`retract_batch`] (when used as a window aggregate [window
39
///   function])
40
///
41
/// * convert its internal state to a vector of aggregate values via
42
///   [`state`] and combine the state from multiple accumulators
43
///   via [`merge_batch`], as part of efficient multi-phase grouping.
44
///
45
/// [`GroupsAccumulator`]: crate::GroupsAccumulator
46
/// [`update_batch`]: Self::update_batch
47
/// [`retract_batch`]: Self::retract_batch
48
/// [`state`]: Self::state
49
/// [`evaluate`]: Self::evaluate
50
/// [`merge_batch`]: Self::merge_batch
51
/// [window function]: https://en.wikipedia.org/wiki/Window_function_(SQL)
52
pub trait Accumulator: Send + Sync + Debug {
53
    /// Updates the accumulator's state from its input.
54
    ///
55
    /// `values` contains the arguments to this aggregate function.
56
    ///
57
    /// For example, the `SUM` accumulator maintains a running sum,
58
    /// and `update_batch` adds each of the input values to the
59
    /// running sum.
60
    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()>;
61
62
    /// Returns the final aggregate value, consuming the internal state.
63
    ///
64
    /// For example, the `SUM` accumulator maintains a running sum,
65
    /// and `evaluate` will produce that running sum as its output.
66
    ///
67
    /// This function should not be called twice, otherwise it will
68
    /// result in potentially non-deterministic behavior.
69
    ///
70
    /// This function gets `&mut self` to allow for the accumulator to build
71
    /// arrow compatible internal state that can be returned without copying
72
    /// when possible (for example distinct strings)
73
    fn evaluate(&mut self) -> Result<ScalarValue>;
74
75
    /// Returns the allocated size required for this accumulator, in
76
    /// bytes, including `Self`.
77
    ///
78
    /// This value is used to calculate the memory used during
79
    /// execution so DataFusion can stay within its allotted limit.
80
    ///
81
    /// "Allocated" means that for internal containers such as `Vec`,
82
    /// the `capacity` should be used not the `len`.
83
    fn size(&self) -> usize;
84
85
    /// Returns the intermediate state of the accumulator, consuming the
86
    /// intermediate state.
87
    ///
88
    /// This function should not be called twice, otherwise it will
89
    /// result in potentially non-deterministic behavior.
90
    ///
91
    /// This function gets `&mut self` to allow for the accumulator to build
92
    /// arrow compatible internal state that can be returned without copying
93
    /// when possible (for example distinct strings).
94
    ///
95
    /// Intermediate state is used for "multi-phase" grouping in
96
    /// DataFusion, where an aggregate is computed in parallel with
97
    /// multiple `Accumulator` instances, as described below:
98
    ///
99
    /// # MultiPhase Grouping
100
    ///
101
    /// ```text
102
    ///                               ▲
103
    ///                               │                   evaluate() is called to
104
    ///                               │                   produce the final aggregate
105
    ///                               │                   value per group
106
    ///                               │
107
    ///                  ┌─────────────────────────┐
108
    ///                  │GroupBy                  │
109
    ///                  │(AggregateMode::Final)   │      state() is called for each
110
    ///                  │                         │      group and the resulting
111
    ///                  └─────────────────────────┘      RecordBatches passed to the
112
    ///                               ▲                   Final GroupBy via merge_batch()
113
    ///                               │
114
    ///              ┌────────────────┴───────────────┐
115
    ///              │                                │
116
    ///              │                                │
117
    /// ┌─────────────────────────┐      ┌─────────────────────────┐
118
    /// │         GroupBy         │      │         GroupBy         │
119
    /// │(AggregateMode::Partial) │      │(AggregateMode::Partial) │
120
    /// └─────────────────────────┘      └─────────────────────────┘
121
    ///              ▲                                ▲
122
    ///              │                                │    update_batch() is called for
123
    ///              │                                │    each input RecordBatch
124
    ///         .─────────.                      .─────────.
125
    ///      ,─'           '─.                ,─'           '─.
126
    ///     ;      Input      :              ;      Input      :
127
    ///     :   Partition 0   ;              :   Partition 1   ;
128
    ///      ╲               ╱                ╲               ╱
129
    ///       '─.         ,─'                  '─.         ,─'
130
    ///          `───────'                        `───────'
131
    /// ```
132
    ///
133
    /// The partial state is serialized as `Arrays` and then combined
134
    /// with other partial states from different instances of this
135
    /// Accumulator (that ran on different partitions, for example).
136
    ///
137
    /// The state can be and often is a different type than the output
138
    /// type of the [`Accumulator`] and needs different merge
139
    /// operations (for example, the partial state for `COUNT` needs
140
    /// to be summed together)
141
    ///
142
    /// Some accumulators can return multiple values for their
143
    /// intermediate states. For example, average tracks `sum` and
144
    /// `n`, and this function should return
145
    /// a vector of two values, sum and n.
146
    ///
147
    /// Note that [`ScalarValue::List`] can be used to pass multiple
148
    /// values if the number of intermediate values is not known at
149
    /// planning time (e.g. for `MEDIAN`)
150
    ///
151
    /// # Multi-phase repartitioned Grouping
152
    ///
153
    /// Many multi-phase grouping plans contain a Repartition operation
154
    /// as well as shown below:
155
    ///
156
    /// ```text
157
    ///                ▲                          ▲
158
    ///                │                          │
159
    ///                │                          │
160
    ///                │                          │
161
    ///                │                          │
162
    ///                │                          │
163
    ///    ┌───────────────────────┐  ┌───────────────────────┐       4. Each AggregateMode::Final
164
    ///    │GroupBy                │  │GroupBy                │       GroupBy has an entry for its
165
    ///    │(AggregateMode::Final) │  │(AggregateMode::Final) │       subset of groups (in this case
166
    ///    │                       │  │                       │       that means half the entries)
167
    ///    └───────────────────────┘  └───────────────────────┘
168
    ///                ▲                          ▲
169
    ///                │                          │
170
    ///                └─────────────┬────────────┘
171
    ///                              │
172
    ///                              │
173
    ///                              │
174
    ///                 ┌─────────────────────────┐                   3. Repartitioning by hash(group
175
    ///                 │       Repartition       │                   keys) ensures that each distinct
176
    ///                 │         HASH(x)         │                   group key now appears in exactly
177
    ///                 └─────────────────────────┘                   one partition
178
    ///                              ▲
179
    ///                              │
180
    ///              ┌───────────────┴─────────────┐
181
    ///              │                             │
182
    ///              │                             │
183
    /// ┌─────────────────────────┐  ┌──────────────────────────┐     2. Each AggregateMode::Partial
184
    /// │         GroupBy         │  │         GroupBy          │     2. Each AggregateMode::Partial
185
    /// │(AggregateMode::Partial) │  │ (AggregateMode::Partial) │     the groups
186
    /// └─────────────────────────┘  └──────────────────────────┘
187
    ///              ▲                             ▲
188
    ///              │                             │
189
    ///              │                             │
190
    ///         .─────────.                   .─────────.
191
    ///      ,─'           '─.             ,─'           '─.
192
    ///     ;      Input      :           ;      Input      :         1. Since input data is
193
    ///     :   Partition 0   ;           :   Partition 1   ;         arbitrarily or RoundRobin
194
    ///      ╲               ╱             ╲               ╱          distributed, each partition
195
    ///       '─.         ,─'               '─.         ,─'           likely has all distinct
196
    ///          `───────'                     `───────'              group keys
197
    /// ```
198
    ///
199
    /// This structure is used so that the `AggregateMode::Partial` accumulators
200
    /// reduce the cardinality of the input as soon as possible. Typically,
201
    /// each partial accumulator sees all groups in the input as the group keys
202
    /// are evenly distributed across the input.
203
    ///
204
    /// The final output is computed by repartitioning the result of
205
    /// [`Self::state`] from each Partial aggregate by `hash(group keys)` so
206
    /// that each distinct group key appears in exactly one of the
207
    /// `AggregateMode::Final` GroupBy nodes. The outputs of the final nodes are
208
    /// then unioned together to produce the overall final output.
209
    ///
210
    /// Here is an example that shows the distribution of groups in the
211
    /// different phases
212
    ///
213
    /// ```text
214
    ///               ┌─────┐                ┌─────┐
215
    ///               │  1  │                │  3  │
216
    ///               ├─────┤                ├─────┤
217
    ///               │  2  │                │  4  │                After repartitioning by
218
    ///               └─────┘                └─────┘                hash(group keys), each distinct
219
    ///               ┌─────┐                ┌─────┐                group key now appears in exactly
220
    ///               │  1  │                │  3  │                one partition
221
    ///               ├─────┤                ├─────┤
222
    ///               │  2  │                │  4  │
223
    ///               └─────┘                └─────┘
224
    ///
225
    ///
226
    /// ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
227
    ///
228
    ///               ┌─────┐                ┌─────┐
229
    ///               │  2  │                │  2  │
230
    ///               ├─────┤                ├─────┤
231
    ///               │  1  │                │  2  │
232
    ///               ├─────┤                ├─────┤
233
    ///               │  3  │                │  3  │
234
    ///               ├─────┤                ├─────┤
235
    ///               │  4  │                │  1  │
236
    ///               └─────┘                └─────┘                Input data is arbitrarily or
237
    ///                 ...                    ...                  RoundRobin distributed, each
238
    ///               ┌─────┐                ┌─────┐                partition likely has all
239
    ///               │  1  │                │  4  │                distinct group keys
240
    ///               ├─────┤                ├─────┤
241
    ///               │  4  │                │  3  │
242
    ///               ├─────┤                ├─────┤
243
    ///               │  1  │                │  1  │
244
    ///               ├─────┤                ├─────┤
245
    ///               │  4  │                │  3  │
246
    ///               └─────┘                └─────┘
247
    ///
248
    ///           group values           group values
249
    ///           in partition 0         in partition 1
250
    /// ```
251
    fn state(&mut self) -> Result<Vec<ScalarValue>>;
252
253
    /// Updates the accumulator's state from an `Array` containing one
254
    /// or more intermediate values.
255
    ///
256
    /// For some aggregates (such as `SUM`), merge_batch is the same
257
    /// as `update_batch`, but for some aggregates (such as `COUNT`)
258
    /// the operations differ. See [`Self::state`] for more details on how
259
    /// state is used and merged.
260
    ///
261
    /// The `states` array passed was formed by concatenating the
262
    /// results of calling [`Self::state`] on zero or more other
263
    /// `Accumulator` instances.
264
    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()>;
265
266
    /// Retracts (removes) an update (caused by the given inputs) to the
267
    /// accumulator's state.
268
    ///
269
    /// This is the inverse operation of [`Self::update_batch`] and is used
270
    /// to incrementally calculate window aggregates where the `OVER`
271
    /// clause defines a bounded window.
272
    ///
273
    /// # Example
274
    ///
275
    /// For example, given the following input partition
276
    ///
277
    /// ```text
278
    ///                     │      current      │
279
    ///                            window
280
    ///                     │                   │
281
    ///                ┌────┬────┬────┬────┬────┬────┬────┬────┬────┐
282
    ///     Input      │ A  │ B  │ C  │ D  │ E  │ F  │ G  │ H  │ I  │
283
    ///   partition    └────┴────┴────┴────┼────┴────┴────┴────┼────┘
284
    ///
285
    ///                                    │         next      │
286
    ///                                             window
287
    /// ```
288
    ///
289
    /// First, [`Self::evaluate`] will be called to produce the output
290
    /// for the current window.
291
    ///
292
    /// Then, to advance to the next window:
293
    ///
294
    /// First, [`Self::retract_batch`] will be called with the values
295
    /// that are leaving the window, `[B, C, D]` and then
296
    /// [`Self::update_batch`] will be called with the values that are
297
    /// entering the window, `[F, G, H]`.
298
0
    fn retract_batch(&mut self, _values: &[ArrayRef]) -> Result<()> {
299
0
        // Default implementation: retraction is not supported here, so `_values` is ignored and an internal error is reported. TODO: add retract for all accumulators
300
0
        internal_err!(
301
0
            "Retract should be implemented for aggregate functions when used with custom window frame queries"
302
0
        )
303
0
    }
304
305
    /// Does the accumulator support incrementally updating its value
306
    /// by *removing* values.
307
    ///
308
    /// If this function returns true, [`Self::retract_batch`] will be
309
    /// called for sliding window functions such as queries with an
310
    /// `OVER (ROWS BETWEEN 1 PRECEDING AND 2 FOLLOWING)`
311
0
    fn supports_retract_batch(&self) -> bool {
312
0
        false // default: this accumulator does not advertise support for retraction
313
0
    }
314
}