/Users/andrewlamb/Software/datafusion/datafusion/expr-common/src/groups_accumulator.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Vectorized [`GroupsAccumulator`] |
19 | | |
20 | | use arrow::array::{ArrayRef, BooleanArray}; |
21 | | use datafusion_common::{not_impl_err, Result}; |
22 | | |
23 | | /// Describes how many rows should be emitted during grouping. |
24 | | #[derive(Debug, Clone, Copy)] |
25 | | pub enum EmitTo { |
26 | | /// Emit all groups |
27 | | All, |
28 | | /// Emit only the first `n` groups and shift all existing group |
29 | | /// indexes down by `n`. |
30 | | /// |
31 | | /// For example, if `n=10`, group_index `0, 1, ... 9` are emitted |
32 | | /// and group indexes `10, 11, 12, ...` become `0, 1, 2, ...`. |
33 | | First(usize), |
34 | | } |
35 | | |
36 | | impl EmitTo { |
37 | | /// Removes from the front of `v` the number of rows that should |
38 | | /// be emitted, returning a `Vec` with the elements taken and |
39 | | /// leaving the remaining values in `v`. |
40 | | /// |
41 | | /// This avoids copying when `self` is [`EmitTo::All`]. |
42 | 138 | pub fn take_needed<T>(&self, v: &mut Vec<T>) -> Vec<T> { |
43 | 138 | match self { |
44 | | Self::All => { |
45 | | // Take the entire vector, leave new (empty) vector |
46 | 90 | std::mem::take(v) |
47 | | } |
48 | 48 | Self::First(n) => { |
49 | 48 | // split off everything after the first n values into t |
50 | 48 | let mut t = v.split_off(*n); |
51 | 48 | // swap so v keeps the values after the first n and t holds the first n |
52 | 48 | std::mem::swap(v, &mut t); |
53 | 48 | t |
54 | | } |
55 | | } |
56 | 138 | } |
57 | | } |
58 | | |
59 | | /// `GroupsAccumulator` implements a single aggregate (e.g. AVG) and |
60 | | /// stores the state for *all* groups internally. |
61 | | /// |
62 | | /// Logically, a [`GroupsAccumulator`] stores a mapping from each group index to |
63 | | /// the state of the aggregate for that group. For example an implementation for |
64 | | /// `min` might look like |
65 | | /// |
66 | | /// ```text |
67 | | /// ┌─────┐ |
68 | | /// │ 0 │───────────▶ 100 |
69 | | /// ├─────┤ |
70 | | /// │ 1 │───────────▶ 200 |
71 | | /// └─────┘ |
72 | | /// ... ... |
73 | | /// ┌─────┐ |
74 | | /// │ N-2 │───────────▶ 50 |
75 | | /// ├─────┤ |
76 | | /// │ N-1 │───────────▶ 200 |
77 | | /// └─────┘ |
78 | | /// |
79 | | /// |
80 | | /// Logical group Current Min |
81 | | /// number value for that |
82 | | /// group |
83 | | /// ``` |
84 | | /// |
85 | | /// # Notes on Implementing `GroupsAccumulator` |
86 | | /// |
87 | | /// All aggregates must first implement the simpler [`Accumulator`] trait, which |
88 | | /// handles state for a single group. Implementing `GroupsAccumulator` is |
89 | | /// optional and is harder to implement than `Accumulator`, but can be much |
90 | | /// faster for queries with many group values. See the [Aggregating Millions of |
91 | | /// Groups Fast blog] for more background. |
92 | | /// |
93 | | /// # Details |
94 | | /// Each group is assigned a `group_index` by the hash table and each |
95 | | /// accumulator manages the specific state, one per `group_index`. |
96 | | /// |
97 | | /// `group_index`es are contiguous (there aren't gaps), and thus it is |
98 | | /// expected that each `GroupsAccumulator` will use something like `Vec<..>` |
99 | | /// to store the group states. |
100 | | /// |
101 | | /// [`Accumulator`]: crate::accumulator::Accumulator |
102 | | /// [Aggregating Millions of Groups Fast blog]: https://arrow.apache.org/blog/2023/08/05/datafusion_fast_grouping/ |
103 | | pub trait GroupsAccumulator: Send { |
104 | | /// Updates the accumulator's state from its arguments, encoded as |
105 | | /// a vector of [`ArrayRef`]s. |
106 | | /// |
107 | | /// * `values`: the input arguments to the accumulator |
108 | | /// |
109 | | /// * `group_indices`: the index of the group to which each row |
110 | | /// in `values` belongs |
111 | | /// |
112 | | /// * `opt_filter`: if present, only update aggregate state using |
113 | | /// `values[i]` if `opt_filter[i]` is true |
114 | | /// |
115 | | /// * `total_num_groups`: the number of groups (the largest |
116 | | /// group_index is thus `total_num_groups - 1`). |
117 | | /// |
118 | | /// Note that subsequent calls to update_batch may have larger |
119 | | /// total_num_groups as new groups are seen. |
120 | | fn update_batch( |
121 | | &mut self, |
122 | | values: &[ArrayRef], |
123 | | group_indices: &[usize], |
124 | | opt_filter: Option<&BooleanArray>, |
125 | | total_num_groups: usize, |
126 | | ) -> Result<()>; |
127 | | |
128 | | /// Returns the final aggregate value for each group as a single |
129 | | /// array, resetting the internal state. |
130 | | /// |
131 | | /// The rows returned *must* be in group_index order: The value |
132 | | /// for group_index 0, followed by 1, etc. Any group_index that |
133 | | /// did not have values should be null. |
134 | | /// |
135 | | /// For example, a `SUM` accumulator maintains a running sum for |
136 | | /// each group, and `evaluate` will produce that running sum as |
137 | | /// its output for all groups, in group_index order |
138 | | /// |
139 | | /// If `emit_to` is [`EmitTo::All`], the accumulator should |
140 | | /// return all groups and release / reset its internal state |
141 | | /// equivalent to when it was first created. |
142 | | /// |
143 | | /// If `emit_to` is [`EmitTo::First`], only the first `n` groups |
144 | | /// should be emitted and the state for those first groups |
145 | | /// removed. State for the remaining groups must be retained for |
146 | | /// future use. The group_indices on subsequent calls to |
147 | | /// `update_batch` or `merge_batch` will be shifted down by |
148 | | /// `n`. See [`EmitTo::First`] for more details. |
149 | | fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef>; |
150 | | |
151 | | /// Returns the intermediate aggregate state for this accumulator, |
152 | | /// used for multi-phase grouping, resetting its internal state. |
153 | | /// |
154 | | /// See [`Accumulator::state`] for more information on multi-phase |
155 | | /// aggregation. |
156 | | /// |
157 | | /// For example, `AVG` might return two arrays: `SUM` and `COUNT` |
158 | | /// but the `MIN` aggregate would just return a single array. |
159 | | /// |
160 | | /// Note more sophisticated internal state can be passed as |
161 | | /// single `StructArray` rather than multiple arrays. |
162 | | /// |
163 | | /// See [`Self::evaluate`] for details on the required output |
164 | | /// order and `emit_to`. |
165 | | /// |
166 | | /// [`Accumulator::state`]: crate::accumulator::Accumulator::state |
167 | | fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>>; |
168 | | |
169 | | /// Merges intermediate state (the output from [`Self::state`]) |
170 | | /// into this accumulator's current state. |
171 | | /// |
172 | | /// For some aggregates (such as `SUM`), `merge_batch` is the same |
173 | | /// as `update_batch`, but for some aggregates (such as `COUNT`, |
174 | | /// where the partial counts must be summed) the operations |
175 | | /// differ. See [`Self::state`] for more details on how state is |
176 | | /// used and merged. |
177 | | /// |
178 | | /// * `values`: arrays previously produced by calling [`Self::state`] |
179 | | /// |
180 | | /// Other arguments are the same as for [`Self::update_batch`]. |
181 | | fn merge_batch( |
182 | | &mut self, |
183 | | values: &[ArrayRef], |
184 | | group_indices: &[usize], |
185 | | opt_filter: Option<&BooleanArray>, |
186 | | total_num_groups: usize, |
187 | | ) -> Result<()>; |
188 | | |
189 | | /// Converts an input batch directly to the intermediate aggregate state. |
190 | | /// |
191 | | /// This is the equivalent of treating each input row as its own group. It |
192 | | /// is invoked when the Partial phase of a multi-phase aggregation is not |
193 | | /// reducing the cardinality enough to warrant spending more effort on |
194 | | /// pre-aggregation (see `Background` section below), and switches to |
195 | | /// passing intermediate state directly on to the next aggregation phase. |
196 | | /// |
197 | | /// Examples: |
198 | | /// * `COUNT`: an array of 1s for each row in the input batch. |
199 | | /// * `SUM/MIN/MAX`: the input values themselves. |
200 | | /// |
201 | | /// # Arguments |
202 | | /// * `values`: the input arguments to the accumulator |
203 | | /// * `opt_filter`: if present, any row where `opt_filter[i]` is false should be ignored |
204 | | /// |
205 | | /// # Background |
206 | | /// |
207 | | /// In a multi-phase aggregation (see [`Accumulator::state`]), the initial |
208 | | /// Partial phase reduces the cardinality of the input data as soon as |
209 | | /// possible in the plan. |
210 | | /// |
211 | | /// This strategy is very effective for queries with a small number of |
212 | | /// groups, as most of the data is aggregated immediately and only a small |
213 | | /// amount of data must be repartitioned (see [`Accumulator::state`] for |
214 | | /// background). |
215 | | /// |
216 | | /// However, for queries with a large number of groups, the Partial phase |
217 | | /// often does not reduce the cardinality enough to warrant the memory and |
218 | | /// CPU cost of actually performing the aggregation. For such cases, the |
219 | | /// HashAggregate operator will dynamically switch to passing intermediate |
220 | | /// state directly to the next aggregation phase with minimal processing |
221 | | /// using this method. |
222 | | /// |
223 | | /// [`Accumulator::state`]: crate::accumulator::Accumulator::state |
224 | 0 | fn convert_to_state( |
225 | 0 | &self, |
226 | 0 | _values: &[ArrayRef], |
227 | 0 | _opt_filter: Option<&BooleanArray>, |
228 | 0 | ) -> Result<Vec<ArrayRef>> { |
229 | 0 | not_impl_err!("Input batch conversion to state not implemented") |
230 | 0 | } |
231 | | |
232 | | /// Returns `true` if [`Self::convert_to_state`] is implemented to support |
233 | | /// intermediate aggregate state conversion. |
234 | 0 | fn supports_convert_to_state(&self) -> bool { |
235 | 0 | false |
236 | 0 | } |
237 | | |
238 | | /// Amount of memory used to store the state of this accumulator, |
239 | | /// in bytes. |
240 | | /// |
241 | | /// This function is called once per batch, so it should be `O(n)` to |
242 | | /// compute, not `O(num_groups)` |
243 | | fn size(&self) -> usize; |
244 | | } |