/Users/andrewlamb/Software/datafusion/datafusion/expr-common/src/accumulator.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Accumulator module contains the trait definition for aggregation function's accumulators. |
19 | | |
20 | | use arrow::array::ArrayRef; |
21 | | use datafusion_common::{internal_err, Result, ScalarValue}; |
22 | | use std::fmt::Debug; |
23 | | |
24 | | /// Tracks an aggregate function's state. |
25 | | /// |
26 | | /// `Accumulator`s are stateful objects that implement a single group. They |
27 | | /// aggregate values from multiple rows together into a final output aggregate. |
28 | | /// |
29 | | /// [`GroupsAccumulator`] is an additional more performant (but also complex) API |
30 | | /// that manages state for multiple groups at once. |
31 | | /// |
32 | | /// An accumulator knows how to: |
33 | | /// * update its state from inputs via [`update_batch`] |
34 | | /// |
35 | | /// * compute the final value from its internal state via [`evaluate`] |
36 | | /// |
37 | | /// * retract an update to its state from given inputs via |
38 | | /// [`retract_batch`] (when used as a window aggregate [window |
39 | | /// function]) |
40 | | /// |
41 | | /// * convert its internal state to a vector of aggregate values via |
42 | | /// [`state`] and combine the state from multiple accumulators  |
43 | | /// via [`merge_batch`], as part of efficient multi-phase grouping. |
44 | | /// |
45 | | /// [`GroupsAccumulator`]: crate::GroupsAccumulator |
46 | | /// [`update_batch`]: Self::update_batch |
47 | | /// [`retract_batch`]: Self::retract_batch |
48 | | /// [`state`]: Self::state |
49 | | /// [`evaluate`]: Self::evaluate |
50 | | /// [`merge_batch`]: Self::merge_batch |
51 | | /// [window function]: https://en.wikipedia.org/wiki/Window_function_(SQL) |
52 | | pub trait Accumulator: Send + Sync + Debug { |
53 | | /// Updates the accumulator's state from its input. |
54 | | /// |
55 | | /// `values` contains the arguments to this aggregate function. |
56 | | /// |
57 | | /// For example, the `SUM` accumulator maintains a running sum, |
58 | | /// and `update_batch` adds each of the input values to the |
59 | | /// running sum. |
60 | | fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()>; |
61 | | |
62 | | /// Returns the final aggregate value, consuming the internal state. |
63 | | /// |
64 | | /// For example, the `SUM` accumulator maintains a running sum, |
65 | | /// and `evaluate` will produce that running sum as its output. |
66 | | /// |
67 | | /// This function should not be called twice, otherwise it will |
68 | | /// result in potentially non-deterministic behavior. |
69 | | /// |
70 | | /// This function gets `&mut self` to allow for the accumulator to build |
71 | | /// arrow compatible internal state that can be returned without copying |
72 | | /// when possible (for example distinct strings) |
73 | | fn evaluate(&mut self) -> Result<ScalarValue>; |
74 | | |
75 | | /// Returns the allocated size required for this accumulator, in |
76 | | /// bytes, including `Self`. |
77 | | /// |
78 | | /// This value is used to calculate the memory used during |
79 | | /// execution so DataFusion can stay within its allotted limit. |
80 | | /// |
81 | | /// "Allocated" means that for internal containers such as `Vec`, |
82 | | /// the `capacity` should be used not the `len`. |
83 | | fn size(&self) -> usize; |
84 | | |
85 | | /// Returns the intermediate state of the accumulator, consuming the |
86 | | /// intermediate state. |
87 | | /// |
88 | | /// This function should not be called twice, otherwise it will |
89 | | /// result in potentially non-deterministic behavior. |
90 | | /// |
91 | | /// This function gets `&mut self` to allow for the accumulator to build |
92 | | /// arrow compatible internal state that can be returned without copying |
93 | | /// when possible (for example distinct strings). |
94 | | /// |
95 | | /// Intermediate state is used for "multi-phase" grouping in |
96 | | /// DataFusion, where an aggregate is computed in parallel with |
97 | | /// multiple `Accumulator` instances, as described below: |
98 | | /// |
99 | | /// # MultiPhase Grouping |
100 | | /// |
101 | | /// ```text |
102 | | /// ▲ |
103 | | /// │ evaluate() is called to |
104 | | /// │ produce the final aggregate |
105 | | /// │ value per group |
106 | | /// │ |
107 | | /// ┌─────────────────────────┐ |
108 | | /// │GroupBy │ |
109 | | /// │(AggregateMode::Final) │ state() is called for each |
110 | | /// │ │ group and the resulting |
111 | | /// └─────────────────────────┘ RecordBatches passed to the |
112 | | /// ▲ |
113 | | /// │ |
114 | | /// ┌────────────────┴───────────────┐ |
115 | | /// │ │ |
116 | | /// │ │ |
117 | | /// ┌─────────────────────────┐ ┌─────────────────────────┐ |
118 | | /// │        GroupBy          │     │        GroupBy          │ |
119 | | /// │(AggregateMode::Partial) │ │(AggregateMode::Partial) │ |
120 | | /// └─────────────────────────┘ └─────────────────────────┘ |
121 | | /// ▲ ▲ |
122 | | /// │ │ update_batch() is called for |
123 | | /// │ │ each input RecordBatch |
124 | | /// .─────────. .─────────. |
125 | | /// ,─' '─. ,─' '─. |
126 | | /// ; Input : ; Input : |
127 | | /// : Partition 0 ; : Partition 1 ; |
128 | | /// ╲ ╱ ╲ ╱ |
129 | | /// '─. ,─' '─. ,─' |
130 | | /// `───────' `───────' |
131 | | /// ``` |
132 | | /// |
133 | | /// The partial state is serialized as `Arrays` and then combined |
134 | | /// with other partial states from different instances of this |
135 | | /// Accumulator (that ran on different partitions, for example). |
136 | | /// |
137 | | /// The state can be and often is a different type than the output |
138 | | /// type of the [`Accumulator`] and needs different merge |
139 | | /// operations (for example, the partial state for `COUNT` needs |
140 | | /// to be summed together) |
141 | | /// |
142 | | /// Some accumulators can return multiple values for their |
143 | | /// intermediate states. For example average, tracks `sum` and |
144 | | /// `n`, and this function should return |
145 | | /// a vector of two values, sum and n. |
146 | | /// |
147 | | /// Note that [`ScalarValue::List`] can be used to pass multiple |
148 | | /// values if the number of intermediate values is not known at |
149 | | /// planning time (e.g. for `MEDIAN`) |
150 | | /// |
151 | | /// # Multi-phase repartitioned Grouping |
152 | | /// |
153 | | /// Many multi-phase grouping plans contain a Repartition operation |
154 | | /// as well as shown below: |
155 | | /// |
156 | | /// ```text |
157 | | /// ▲ ▲ |
158 | | /// │ │ |
159 | | /// │ │ |
160 | | /// │ │ |
161 | | /// │ │ |
162 | | /// │ │ |
163 | | /// ┌───────────────────────┐ ┌───────────────────────┐ 4. Each AggregateMode::Final |
164 | | /// │GroupBy │ │GroupBy │ GroupBy has an entry for its |
165 | | /// │(AggregateMode::Final) │ │(AggregateMode::Final) │ subset of groups (in this case |
166 | | /// │ │ │ │ that means half the entries) |
167 | | /// └───────────────────────┘ └───────────────────────┘ |
168 | | /// ▲ ▲ |
169 | | /// │ │ |
170 | | /// └─────────────┬────────────┘ |
171 | | /// │ |
172 | | /// │ |
173 | | /// │ |
174 | | /// ┌─────────────────────────┐ 3. Repartitioning by hash(group |
175 | | /// │ Repartition │ keys) ensures that each distinct |
176 | | /// │ HASH(x) │ group key now appears in exactly |
177 | | /// └─────────────────────────┘ one partition |
178 | | /// ▲ |
179 | | /// │ |
180 | | /// ┌───────────────┴─────────────┐ |
181 | | /// │ │ |
182 | | /// │ │ |
183 | | /// ┌─────────────────────────┐ ┌──────────────────────────┐ 2. Each AggregateMode::Partial |
184 | | /// │        GroupBy          │   │        GroupBy           │ GroupBy has an entry for *all* |
185 | | /// │(AggregateMode::Partial) │ │ (AggregateMode::Partial) │ the groups |
186 | | /// └─────────────────────────┘ └──────────────────────────┘ |
187 | | /// ▲ ▲ |
188 | | /// │ │ |
189 | | /// │ │ |
190 | | /// .─────────. .─────────. |
191 | | /// ,─' '─. ,─' '─. |
192 | | /// ; Input : ; Input : 1. Since input data is |
193 | | /// : Partition 0 ; : Partition 1 ; arbitrarily or RoundRobin |
194 | | /// ╲ ╱ ╲ ╱ distributed, each partition |
195 | | /// '─. ,─' '─. ,─' likely has all distinct |
196 | | /// `───────' `───────' |
197 | | /// ``` |
198 | | /// |
199 | | /// This structure is used so that the `AggregateMode::Partial` accumulators |
200 | | /// reduce the cardinality of the input as soon as possible. Typically, |
201 | | /// each partial accumulator sees all groups in the input as the group keys |
202 | | /// are evenly distributed across the input. |
203 | | /// |
204 | | /// The final output is computed by repartitioning the result of |
205 | | /// [`Self::state`] from each Partial aggregate and `hash(group keys)` so |
206 | | /// that each distinct group key appears in exactly one of the |
207 | | /// `AggregateMode::Final` GroupBy nodes. The output of the final nodes are |
208 | | /// then unioned together to produce the overall final output. |
209 | | /// |
210 | | /// Here is an example that shows the distribution of groups in the |
211 | | /// different phases |
212 | | /// |
213 | | /// ```text |
214 | | /// ┌─────┐ ┌─────┐ |
215 | | /// │ 1 │ │ 3 │ |
216 | | /// ├─────┤ ├─────┤ |
217 | | /// │ 2 │ │ 4 │ After repartitioning by |
218 | | /// └─────┘ └─────┘ hash(group keys), each distinct |
219 | | /// ┌─────┐ ┌─────┐ group key now appears in exactly |
220 | | /// │ 1 │ │ 3 │ one partition |
221 | | /// ├─────┤ ├─────┤ |
222 | | /// │ 2 │ │ 4 │ |
223 | | /// └─────┘ └─────┘ |
224 | | /// |
225 | | /// |
226 | | /// ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ |
227 | | /// |
228 | | /// ┌─────┐ ┌─────┐ |
229 | | /// │ 2 │ │ 2 │ |
230 | | /// ├─────┤ ├─────┤ |
231 | | /// │ 1 │ │ 2 │ |
232 | | /// ├─────┤ ├─────┤ |
233 | | /// │ 3 │ │ 3 │ |
234 | | /// ├─────┤ ├─────┤ |
235 | | /// │ 4 │ │ 1 │ |
236 | | /// └─────┘ └─────┘ Input data is arbitrarily or |
237 | | /// ... ... RoundRobin distributed, each |
238 | | /// ┌─────┐ ┌─────┐ partition likely has all |
239 | | /// │ 1 │ │ 4 │ distinct group keys |
240 | | /// ├─────┤ ├─────┤ |
241 | | /// │ 4 │ │ 3 │ |
242 | | /// ├─────┤ ├─────┤ |
243 | | /// │ 1 │ │ 1 │ |
244 | | /// ├─────┤ ├─────┤ |
245 | | /// │ 4 │ │ 3 │ |
246 | | /// └─────┘ └─────┘ |
247 | | /// |
248 | | /// group values group values |
249 | | /// in partition 0 in partition 1 |
250 | | /// ``` |
251 | | fn state(&mut self) -> Result<Vec<ScalarValue>>; |
252 | | |
253 | | /// Updates the accumulator's state from an `Array` containing one |
254 | | /// or more intermediate values. |
255 | | /// |
256 | | /// For some aggregates (such as `SUM`), merge_batch is the same |
257 | | /// as `update_batch`, but for some aggregates (such as `COUNT`) |
258 | | /// the operations differ. See [`Self::state`] for more details on how |
259 | | /// state is used and merged. |
260 | | /// |
261 | | /// The `states` array passed was formed by concatenating the |
262 | | /// results of calling [`Self::state`] on zero or more other |
263 | | /// `Accumulator` instances. |
264 | | fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()>; |
265 | | |
266 | | /// Retracts (removes) an update (caused by the given inputs) to |
267 | | /// accumulator's state. |
268 | | /// |
269 | | /// This is the inverse operation of [`Self::update_batch`] and is used |
270 | | /// to incrementally calculate window aggregates where the `OVER` |
271 | | /// clause defines a bounded window. |
272 | | /// |
273 | | /// # Example |
274 | | /// |
275 | | /// For example, given the following input partition |
276 | | /// |
277 | | /// ```text |
278 | | /// │ current │ |
279 | | /// window |
280 | | /// │ │ |
281 | | /// ┌────┬────┬────┬────┬────┬────┬────┬────┬────┐ |
282 | | /// Input │ A │ B │ C │ D │ E │ F │ G │ H │ I │ |
283 | | /// partition └────┴────┴────┴────┼────┴────┴────┴────┼────┘ |
284 | | /// |
285 | | /// │ next │ |
286 | | /// window |
287 | | /// ``` |
288 | | /// |
289 | | /// First, [`Self::evaluate`] will be called to produce the output |
290 | | /// for the current window. |
291 | | /// |
292 | | /// Then, to advance to the next window: |
293 | | /// |
294 | | /// First, [`Self::retract_batch`] will be called with the values |
295 | | /// that are leaving the window, `[B, C, D]` and then |
296 | | /// [`Self::update_batch`] will be called with the values that are |
297 | | /// entering the window, `[F, G, H]`. |
298 | 0 | fn retract_batch(&mut self, _values: &[ArrayRef]) -> Result<()> { |
299 | 0 | // TODO add retract for all accumulators |
300 | 0 | internal_err!( |
301 | 0 | "Retract should be implemented for aggregate functions when used with custom window frame queries" |
302 | 0 | ) |
303 | 0 | } |
304 | | |
305 | | /// Does the accumulator support incrementally updating its value |
306 | | /// by *removing* values. |
307 | | /// |
308 | | /// If this function returns true, [`Self::retract_batch`] will be |
309 | | /// called for sliding window functions such as queries with an |
310 | | /// `OVER (ROWS BETWEEN 1 PRECEDING AND 2 FOLLOWING)` |
311 | 0 | fn supports_retract_batch(&self) -> bool { |
312 | 0 | false |
313 | 0 | } |
314 | | } |