/Users/andrewlamb/Software/datafusion/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! Utilities for implementing GroupsAccumulator |
19 | | //! Adapter that makes [`GroupsAccumulator`] out of [`Accumulator`] |
20 | | |
21 | | pub mod accumulate; |
22 | | pub mod bool_op; |
23 | | pub mod nulls; |
24 | | pub mod prim_op; |
25 | | |
26 | | use arrow::{ |
27 | | array::{ArrayRef, AsArray, BooleanArray, PrimitiveArray}, |
28 | | compute, |
29 | | datatypes::UInt32Type, |
30 | | }; |
31 | | use datafusion_common::{ |
32 | | arrow_datafusion_err, utils::take_arrays, DataFusionError, Result, ScalarValue, |
33 | | }; |
34 | | use datafusion_expr_common::accumulator::Accumulator; |
35 | | use datafusion_expr_common::groups_accumulator::{EmitTo, GroupsAccumulator}; |
36 | | |
37 | | /// An adapter that implements [`GroupsAccumulator`] for any [`Accumulator`] |
38 | | /// |
39 | | /// While [`Accumulator`]s are simpler to implement and can support |
40 | | /// more general calculations (like retractable window functions), |
41 | | /// they are not as fast as a specialized `GroupsAccumulator`. This |
42 | | /// adapter bridges the gap so the group by operator only needs to operate |
43 | | /// in terms of [`GroupsAccumulator`]. |
44 | | /// |
45 | | /// Internally, this adapter creates a new [`Accumulator`] for each group which |
46 | | /// stores the state for that group. This both requires an allocation for each |
47 | | /// Accumulator, internal indices, as well as whatever internal allocations the |
48 | | /// Accumulator itself requires. |
49 | | /// |
50 | | /// For example, consider a `MinAccumulator` that computes the minimum string |
51 | | /// value with a [`ScalarValue::Utf8`]. That will require at least two allocations |
52 | | /// per group (one for the `MinAccumulator` and one for the `ScalarValue::Utf8`). |
53 | | /// |
54 | | /// ```text |
55 | | /// ┌─────────────────────────────────┐ |
56 | | /// │MinAccumulator { │ |
57 | | /// ┌─────▶│ min: ScalarValue::Utf8("A") │───────┐ |
58 | | /// │ │} │ │ |
59 | | /// │ └─────────────────────────────────┘ └───────▶ "A" |
60 | | /// ┌─────┐ │ ┌─────────────────────────────────┐ |
61 | | /// │ 0 │─────┘ │MinAccumulator { │ |
62 | | /// ├─────┤ ┌─────▶│ min: ScalarValue::Utf8("Z") │───────────────▶ "Z" |
63 | | /// │ 1 │─────┘ │} │ |
64 | | /// └─────┘ └─────────────────────────────────┘ ... |
65 | | /// ... ... |
66 | | /// ┌─────┐ ┌────────────────────────────────┐ |
67 | | /// │ N-2 │ │MinAccumulator { │ |
68 | | /// ├─────┤ │ min: ScalarValue::Utf8("A") │────────────────▶ "A" |
69 | | /// │ N-1 │─────┐ │} │ |
70 | | /// └─────┘ │ └────────────────────────────────┘ |
71 | | /// │ ┌────────────────────────────────┐ ┌───────▶ "Q" |
72 | | /// │ │MinAccumulator { │ │ |
73 | | /// └─────▶│ min: ScalarValue::Utf8("Q") │────────┘ |
74 | | /// │} │ |
75 | | /// └────────────────────────────────┘ |
76 | | /// |
77 | | /// |
78 | | /// Logical group Current Min/Max value for that group stored |
79 | | /// number as a ScalarValue which points to an |
80 | | /// individually allocated String |
81 | | /// |
82 | | ///``` |
83 | | /// |
84 | | /// # Optimizations |
85 | | /// |
86 | | /// The adapter minimizes the number of calls to [`Accumulator::update_batch`] |
87 | | /// by first collecting the input rows for each group into a contiguous array |
88 | | /// using [`compute::take`] |
89 | | /// |
90 | | pub struct GroupsAccumulatorAdapter { |
91 | | factory: Box<dyn Fn() -> Result<Box<dyn Accumulator>> + Send>, |
92 | | |
93 | | /// state for each group, stored in group_index order |
94 | | states: Vec<AccumulatorState>, |
95 | | |
96 | | /// Current memory usage, in bytes. |
97 | | /// |
98 | | /// Note this is incrementally updated with deltas to avoid the |
99 | | /// call to size() being a bottleneck. We saw size() being a |
100 | | /// bottleneck in earlier implementations when there were many |
101 | | /// distinct groups. |
102 | | allocation_bytes: usize, |
103 | | } |
104 | | |
105 | | struct AccumulatorState { |
106 | | /// [`Accumulator`] that stores the per-group state |
107 | | accumulator: Box<dyn Accumulator>, |
108 | | |
109 | | /// scratch space: indexes in the input array that will be fed to |
110 | | /// this accumulator. Stores indexes as `u32` to match the arrow |
111 | | /// `take` kernel input. |
112 | | indices: Vec<u32>, |
113 | | } |
114 | | |
115 | | impl AccumulatorState { |
116 | 130 | fn new(accumulator: Box<dyn Accumulator>) -> Self { |
117 | 130 | Self { |
118 | 130 | accumulator, |
119 | 130 | indices: vec![], |
120 | 130 | } |
121 | 130 | } |
122 | | |
123 | | /// Returns the amount of memory taken by this structure and its accumulator |
124 | 584 | fn size(&self) -> usize { |
125 | 584 | self.accumulator.size() |
126 | 584 | + std::mem::size_of_val(self) |
127 | 584 | + self.indices.allocated_size() |
128 | 584 | } |
129 | | } |
130 | | |
131 | | impl GroupsAccumulatorAdapter { |
132 | | /// Create a new adapter that will create a new [`Accumulator`] |
133 | | /// for each group, using the specified factory function |
134 | 40 | pub fn new<F>(factory: F) -> Self |
135 | 40 | where |
136 | 40 | F: Fn() -> Result<Box<dyn Accumulator>> + Send + 'static, |
137 | 40 | { |
138 | 40 | Self { |
139 | 40 | factory: Box::new(factory), |
140 | 40 | states: vec![], |
141 | 40 | allocation_bytes: 0, |
142 | 40 | } |
143 | 40 | } |
144 | | |
145 | | /// Ensure that `self.states` has `total_num_groups` entries |
146 | 66 | fn make_accumulators_if_needed(&mut self, total_num_groups: usize) -> Result<()> { |
147 | 66 | // can't shrink |
148 | 66 | assert!(total_num_groups >= self.states.len()); |
149 | 66 | let vec_size_pre = self.states.allocated_size(); |
150 | 66 | |
151 | 66 | // instantiate new accumulators |
152 | 66 | let new_accumulators = total_num_groups - self.states.len(); |
153 | 66 | for _ in 0..new_accumulators { |
154 | 130 | let accumulator = (self.factory)()?0 ; |
155 | 130 | let state = AccumulatorState::new(accumulator); |
156 | 130 | self.add_allocation(state.size()); |
157 | 130 | self.states.push(state); |
158 | | } |
159 | | |
160 | 66 | self.adjust_allocation(vec_size_pre, self.states.allocated_size()); |
161 | 66 | Ok(()) |
162 | 66 | } |
163 | | |
164 | | /// invokes f(accumulator, values) for each group that has values |
165 | | /// in group_indices. |
166 | | /// |
167 | | /// This function first reorders the input and filter so that |
168 | | /// values for each group_index are contiguous and then invokes f |
169 | | /// on the contiguous ranges, to minimize per-row overhead |
170 | | /// |
171 | | /// ```text |
172 | | /// ┌─────────┐ ┌─────────┐ ┌ ─ ─ ─ ─ ┐ ┌─────────┐ ┌ ─ ─ ─ ─ ┐ |
173 | | /// │ ┌─────┐ │ │ ┌─────┐ │ ┌─────┐ ┏━━━━━┓ │ ┌─────┐ │ ┌─────┐ |
174 | | /// │ │ 2 │ │ │ │ 200 │ │ │ │ t │ │ ┃ 0 ┃ │ │ 200 │ │ │ │ t │ │ |
175 | | /// │ ├─────┤ │ │ ├─────┤ │ ├─────┤ ┣━━━━━┫ │ ├─────┤ │ ├─────┤ |
176 | | /// │ │ 2 │ │ │ │ 100 │ │ │ │ f │ │ ┃ 0 ┃ │ │ 300 │ │ │ │ t │ │ |
177 | | /// │ ├─────┤ │ │ ├─────┤ │ ├─────┤ ┣━━━━━┫ │ ├─────┤ │ ├─────┤ |
178 | | /// │ │ 0 │ │ │ │ 200 │ │ │ │ t │ │ ┃ 1 ┃ │ │ 200 │ │ │ │NULL │ │ |
179 | | /// │ ├─────┤ │ │ ├─────┤ │ ├─────┤ ────────▶ ┣━━━━━┫ │ ├─────┤ │ ├─────┤ |
180 | | /// │ │ 1 │ │ │ │ 200 │ │ │ │NULL │ │ ┃ 2 ┃ │ │ 200 │ │ │ │ t │ │ |
181 | | /// │ ├─────┤ │ │ ├─────┤ │ ├─────┤ ┣━━━━━┫ │ ├─────┤ │ ├─────┤ |
182 | | /// │ │ 0 │ │ │ │ 300 │ │ │ │ t │ │ ┃ 2 ┃ │ │ 100 │ │ │ │ f │ │ |
183 | | /// │ └─────┘ │ │ └─────┘ │ └─────┘ ┗━━━━━┛ │ └─────┘ │ └─────┘ |
184 | | /// └─────────┘ └─────────┘ └ ─ ─ ─ ─ ┘ └─────────┘ └ ─ ─ ─ ─ ┘ |
185 | | /// |
186 | | /// logical group values opt_filter logical group values opt_filter |
187 | | /// |
188 | | /// ``` |
189 | 66 | fn invoke_per_accumulator<F>( |
190 | 66 | &mut self, |
191 | 66 | values: &[ArrayRef], |
192 | 66 | group_indices: &[usize], |
193 | 66 | opt_filter: Option<&BooleanArray>, |
194 | 66 | total_num_groups: usize, |
195 | 66 | f: F, |
196 | 66 | ) -> Result<()> |
197 | 66 | where |
198 | 66 | F: Fn(&mut dyn Accumulator, &[ArrayRef]) -> Result<()>, |
199 | 66 | { |
200 | 66 | self.make_accumulators_if_needed(total_num_groups)?0 ; |
201 | | |
202 | 66 | assert_eq!(values[0].len(), group_indices.len()); |
203 | | |
204 | | // figure out which input rows correspond to which groups. |
205 | | // Note that `indices` starts empty for every entry of self.states |
206 | | // (it is cleared out below) |
207 | 234 | for (idx, group_index) in group_indices.iter().enumerate()66 { |
208 | 234 | self.states[*group_index].indices.push(idx as u32); |
209 | 234 | } |
210 | | |
211 | | // groups_with_rows holds a list of group indexes that have |
212 | | // any rows that need to be accumulated, stored in order of |
213 | | // group_index |
214 | | |
215 | 66 | let mut groups_with_rows = vec![]; |
216 | 66 | |
217 | 66 | // batch_indices holds indices into values, each group is contiguous |
218 | 66 | let mut batch_indices = vec![]; |
219 | 66 | |
220 | 66 | // offsets[i] is the index into batch_indices where the rows for |
221 | 66 | // group groups_with_rows[i] start |
222 | 66 | let mut offsets = vec![0]; |
223 | 66 | |
224 | 66 | let mut offset_so_far = 0; |
225 | 184 | for (group_index, state) in self.states.iter_mut().enumerate()66 { |
226 | 184 | let indices = &state.indices; |
227 | 184 | if indices.is_empty() { |
228 | 22 | continue; |
229 | 162 | } |
230 | 162 | |
231 | 162 | groups_with_rows.push(group_index); |
232 | 162 | batch_indices.extend_from_slice(indices); |
233 | 162 | offset_so_far += indices.len(); |
234 | 162 | offsets.push(offset_so_far); |
235 | | } |
236 | 66 | let batch_indices = batch_indices.into(); |
237 | | |
238 | | // reorder the values and opt_filter by batch_indices so that |
239 | | // all values for each group are contiguous, then invoke the |
240 | | // accumulator once per group with values |
241 | 66 | let values = take_arrays(values, &batch_indices)?0 ; |
242 | 66 | let opt_filter = get_filter_at_indices(opt_filter, &batch_indices)?0 ; |
243 | | |
244 | | // invoke each accumulator with the appropriate rows, first |
245 | | // pulling the input arguments for this group into their own |
246 | | // RecordBatch(es) |
247 | 66 | let iter = groups_with_rows.iter().zip(offsets.windows(2)); |
248 | 66 | |
249 | 66 | let mut sizes_pre = 0; |
250 | 66 | let mut sizes_post = 0; |
251 | 228 | for (&group_idx, offsets162 ) in iter { |
252 | 162 | let state = &mut self.states[group_idx]; |
253 | 162 | sizes_pre += state.size(); |
254 | | |
255 | 162 | let values_to_accumulate = slice_and_maybe_filter( |
256 | 162 | &values, |
257 | 162 | opt_filter.as_ref().map(|f| f.as_boolean()0 ), |
258 | 162 | offsets, |
259 | 162 | )?0 ; |
260 | 162 | f(state.accumulator.as_mut(), &values_to_accumulate)?0 ; |
261 | | |
262 | | // clear out the indices so they are empty for the next |
263 | | // invocation |
264 | 162 | state.indices.clear(); |
265 | 162 | sizes_post += state.size(); |
266 | | } |
267 | | |
268 | 66 | self.adjust_allocation(sizes_pre, sizes_post); |
269 | 66 | Ok(()) |
270 | 66 | } |
271 | | |
272 | | /// Increment the allocation by `size` |
273 | | /// |
274 | | /// See [`Self::allocation_bytes`] for rationale. |
275 | 176 | fn add_allocation(&mut self, size: usize) { |
276 | 176 | self.allocation_bytes += size; |
277 | 176 | } |
278 | | |
279 | | /// Decrease the allocation by `size` |
280 | | /// |
281 | | /// See [`Self::allocation_bytes`] for rationale. |
282 | 280 | fn free_allocation(&mut self, size: usize) { |
283 | 280 | // use saturating sub to avoid errors if the accumulators |
284 | 280 | // report erroneous sizes |
285 | 280 | self.allocation_bytes = self.allocation_bytes.saturating_sub(size) |
286 | 280 | } |
287 | | |
288 | | /// Adjusts the allocation for something that started with |
289 | | /// `old_size` and now has `new_size`, avoiding overflow |
290 | | /// |
291 | | /// See [`Self::allocation_bytes`] for rationale. |
292 | 196 | fn adjust_allocation(&mut self, old_size: usize, new_size: usize) { |
293 | 196 | if new_size > old_size { |
294 | 46 | self.add_allocation(new_size - old_size) |
295 | | } else { |
296 | 150 | self.free_allocation(old_size - new_size) |
297 | | } |
298 | 196 | } |
299 | | } |
300 | | |
301 | | impl GroupsAccumulator for GroupsAccumulatorAdapter { |
302 | 32 | fn update_batch( |
303 | 32 | &mut self, |
304 | 32 | values: &[ArrayRef], |
305 | 32 | group_indices: &[usize], |
306 | 32 | opt_filter: Option<&BooleanArray>, |
307 | 32 | total_num_groups: usize, |
308 | 32 | ) -> Result<()> { |
309 | 32 | self.invoke_per_accumulator( |
310 | 32 | values, |
311 | 32 | group_indices, |
312 | 32 | opt_filter, |
313 | 32 | total_num_groups, |
314 | 96 | |accumulator, values_to_accumulate| { |
315 | 96 | accumulator.update_batch(values_to_accumulate) |
316 | 96 | }, |
317 | 32 | )?0 ; |
318 | 32 | Ok(()) |
319 | 32 | } |
320 | | |
321 | 12 | fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> { |
322 | 12 | let vec_size_pre = self.states.allocated_size(); |
323 | 12 | |
324 | 12 | let states = emit_to.take_needed(&mut self.states); |
325 | | |
326 | 12 | let results: Vec<ScalarValue> = states |
327 | 12 | .into_iter() |
328 | 24 | .map(|mut state| { |
329 | 24 | self.free_allocation(state.size()); |
330 | 24 | state.accumulator.evaluate() |
331 | 24 | }) |
332 | 12 | .collect::<Result<_>>()?0 ; |
333 | | |
334 | 12 | let result = ScalarValue::iter_to_array(results); |
335 | 12 | |
336 | 12 | self.adjust_allocation(vec_size_pre, self.states.allocated_size()); |
337 | 12 | |
338 | 12 | result |
339 | 12 | } |
340 | | |
341 | | // filtered_null_mask(opt_filter, &values); |
342 | 52 | fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> { |
343 | 52 | let vec_size_pre = self.states.allocated_size(); |
344 | 52 | let states = emit_to.take_needed(&mut self.states); |
345 | 52 | |
346 | 52 | // each accumulator produces a potential vector of values |
347 | 52 | // which we need to form into columns |
348 | 52 | let mut results: Vec<Vec<ScalarValue>> = vec![]; |
349 | | |
350 | 158 | for mut state106 in states { |
351 | 106 | self.free_allocation(state.size()); |
352 | 106 | let accumulator_state = state.accumulator.state()?0 ; |
353 | 106 | results.resize_with(accumulator_state.len(), Vec::new); |
354 | 318 | for (idx, state_val) in accumulator_state.into_iter().enumerate()106 { |
355 | 318 | results[idx].push(state_val); |
356 | 318 | } |
357 | | } |
358 | | |
359 | | // create an array for each intermediate column |
360 | 52 | let arrays = results |
361 | 52 | .into_iter() |
362 | 52 | .map(ScalarValue::iter_to_array) |
363 | 52 | .collect::<Result<Vec<_>>>()?0 ; |
364 | | |
365 | | // double check each array has the same length (aka the |
366 | | // accumulator was implemented correctly) |
367 | 52 | if let Some(first_col) = arrays.first() { |
368 | 208 | for arr156 in &arrays { |
369 | 156 | assert_eq!(arr.len(), first_col.len()) |
370 | | } |
371 | 0 | } |
372 | 52 | self.adjust_allocation(vec_size_pre, self.states.allocated_size()); |
373 | 52 | |
374 | 52 | Ok(arrays) |
375 | 52 | } |
376 | | |
377 | 34 | fn merge_batch( |
378 | 34 | &mut self, |
379 | 34 | values: &[ArrayRef], |
380 | 34 | group_indices: &[usize], |
381 | 34 | opt_filter: Option<&BooleanArray>, |
382 | 34 | total_num_groups: usize, |
383 | 34 | ) -> Result<()> { |
384 | 34 | self.invoke_per_accumulator( |
385 | 34 | values, |
386 | 34 | group_indices, |
387 | 34 | opt_filter, |
388 | 34 | total_num_groups, |
389 | 66 | |accumulator, values_to_accumulate| { |
390 | 66 | accumulator.merge_batch(values_to_accumulate)?0 ; |
391 | 66 | Ok(()) |
392 | 66 | }, |
393 | 34 | )?0 ; |
394 | 34 | Ok(()) |
395 | 34 | } |
396 | | |
397 | 208 | fn size(&self) -> usize { |
398 | 208 | self.allocation_bytes |
399 | 208 | } |
400 | | |
401 | 0 | fn convert_to_state( |
402 | 0 | &self, |
403 | 0 | values: &[ArrayRef], |
404 | 0 | opt_filter: Option<&BooleanArray>, |
405 | 0 | ) -> Result<Vec<ArrayRef>> { |
406 | 0 | let num_rows = values[0].len(); |
407 | 0 |
|
408 | 0 | // Each row has its respective group |
409 | 0 | let mut results = vec![]; |
410 | 0 | for row_idx in 0..num_rows { |
411 | | // Create the empty accumulator for converting |
412 | 0 | let mut converted_accumulator = (self.factory)()?; |
413 | | |
414 | | // Convert row to states |
415 | 0 | let values_to_accumulate = |
416 | 0 | slice_and_maybe_filter(values, opt_filter, &[row_idx, row_idx + 1])?; |
417 | 0 | converted_accumulator.update_batch(&values_to_accumulate)?; |
418 | 0 | let states = converted_accumulator.state()?; |
419 | | |
420 | | // Resize results to have enough columns according to the converted states |
421 | 0 | results.resize_with(states.len(), || Vec::with_capacity(num_rows)); |
422 | | |
423 | | // Add the states to results |
424 | 0 | for (idx, state_val) in states.into_iter().enumerate() { |
425 | 0 | results[idx].push(state_val); |
426 | 0 | } |
427 | | } |
428 | | |
429 | 0 | let arrays = results |
430 | 0 | .into_iter() |
431 | 0 | .map(ScalarValue::iter_to_array) |
432 | 0 | .collect::<Result<Vec<_>>>()?; |
433 | | |
434 | 0 | Ok(arrays) |
435 | 0 | } |
436 | | |
437 | 32 | fn supports_convert_to_state(&self) -> bool { |
438 | 32 | true |
439 | 32 | } |
440 | | } |
441 | | |
442 | | /// Extension trait for [`Vec`] to account for allocations. |
443 | | pub trait VecAllocExt { |
444 | | /// Item type. |
445 | | type T; |
446 | | /// Return the amount of memory allocated by this Vec (not |
447 | | /// recursively counting any heap allocations contained within the |
448 | | /// structure). Does not include the size of `self` |
449 | | fn allocated_size(&self) -> usize; |
450 | | } |
451 | | |
452 | | impl<T> VecAllocExt for Vec<T> { |
453 | | type T = T; |
454 | 844 | fn allocated_size(&self) -> usize { |
455 | 844 | std::mem::size_of::<T>() * self.capacity() |
456 | 844 | } |
457 | | } |
458 | | |
459 | 66 | fn get_filter_at_indices( |
460 | 66 | opt_filter: Option<&BooleanArray>, |
461 | 66 | indices: &PrimitiveArray<UInt32Type>, |
462 | 66 | ) -> Result<Option<ArrayRef>> { |
463 | 66 | opt_filter |
464 | 66 | .map(|filter| { |
465 | 0 | compute::take( |
466 | 0 | &filter, indices, None, // None: no index check |
467 | 0 | ) |
468 | 66 | }) |
469 | 66 | .transpose() |
470 | 66 | .map_err(|e| arrow_datafusion_err!(e)0 ) |
471 | 66 | } |
472 | | |
473 | | // Copied from physical-plan |
474 | 162 | pub(crate) fn slice_and_maybe_filter( |
475 | 162 | aggr_array: &[ArrayRef], |
476 | 162 | filter_opt: Option<&BooleanArray>, |
477 | 162 | offsets: &[usize], |
478 | 162 | ) -> Result<Vec<ArrayRef>> { |
479 | 162 | let (offset, length) = (offsets[0], offsets[1] - offsets[0]); |
480 | 162 | let sliced_arrays: Vec<ArrayRef> = aggr_array |
481 | 162 | .iter() |
482 | 390 | .map(|array| array.slice(offset, length)) |
483 | 162 | .collect(); |
484 | | |
485 | 162 | if let Some(f0 ) = filter_opt { |
486 | 0 | let filter = f.slice(offset, length); |
487 | 0 |
|
488 | 0 | sliced_arrays |
489 | 0 | .iter() |
490 | 0 | .map(|array| { |
491 | 0 | compute::filter(&array, &filter).map_err(|e| arrow_datafusion_err!(e)) |
492 | 0 | }) |
493 | 0 | .collect() |
494 | | } else { |
495 | 162 | Ok(sliced_arrays) |
496 | | } |
497 | 162 | } |