Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/functions-aggregate-common/src/aggregate/groups_accumulator.rs
Line | Count | Source
   1 |       | // Licensed to the Apache Software Foundation (ASF) under one
   2 |       | // or more contributor license agreements.  See the NOTICE file
   3 |       | // distributed with this work for additional information
   4 |       | // regarding copyright ownership.  The ASF licenses this file
   5 |       | // to you under the Apache License, Version 2.0 (the
   6 |       | // "License"); you may not use this file except in compliance
   7 |       | // with the License.  You may obtain a copy of the License at
   8 |       | //
   9 |       | //   http://www.apache.org/licenses/LICENSE-2.0
  10 |       | //
  11 |       | // Unless required by applicable law or agreed to in writing,
  12 |       | // software distributed under the License is distributed on an
  13 |       | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  14 |       | // KIND, either express or implied.  See the License for the
  15 |       | // specific language governing permissions and limitations
  16 |       | // under the License.
  17 |       |
  18 |       | //! Utilities for implementing GroupsAccumulator
  19 |       | //! Adapter that makes [`GroupsAccumulator`] out of [`Accumulator`]
  20 |       |
  21 |       | pub mod accumulate;
  22 |       | pub mod bool_op;
  23 |       | pub mod nulls;
  24 |       | pub mod prim_op;
  25 |       |
  26 |       | use arrow::{
  27 |       |     array::{ArrayRef, AsArray, BooleanArray, PrimitiveArray},
  28 |       |     compute,
  29 |       |     datatypes::UInt32Type,
  30 |       | };
  31 |       | use datafusion_common::{
  32 |       |     arrow_datafusion_err, utils::take_arrays, DataFusionError, Result, ScalarValue,
  33 |       | };
  34 |       | use datafusion_expr_common::accumulator::Accumulator;
  35 |       | use datafusion_expr_common::groups_accumulator::{EmitTo, GroupsAccumulator};
  36 |       |
  37 |       | /// An adapter that implements [`GroupsAccumulator`] for any [`Accumulator`]
  38 |       | ///
  39 |       | /// While [`Accumulator`]s are simpler to implement and can support
  40 |       | /// more general calculations (like retractable window functions),
  41 |       | /// they are not as fast as a specialized `GroupsAccumulator`. This
  42 |       | /// interface bridges the gap so the group by operator only operates
  43 |       | /// in terms of [`Accumulator`].
  44 |       | ///
  45 |       | /// Internally, this adapter creates a new [`Accumulator`] for each group which
  46 |       | /// stores the state for that group. This both requires an allocation for each
  47 |       | /// Accumulator, internal indices, as well as whatever internal allocations the
  48 |       | /// Accumulator itself requires.
  49 |       | ///
  50 |       | /// For example, consider a `MinAccumulator` that computes the minimum string value with
  51 |       | /// a [`ScalarValue::Utf8`]. That will require at least two allocations per group
  52 |       | /// (one for the `MinAccumulator` and one for the `ScalarValue::Utf8`).
  53 |       | ///
  54 |       | /// ```text
  55 |       | ///                       ┌─────────────────────────────────┐
  56 |       | ///                       │MinAccumulator {                 │
  57 |       | ///                ┌─────▶│ min: ScalarValue::Utf8("A")     │───────┐
  58 |       | ///                │      │}                                │       │
  59 |       | ///                │      └─────────────────────────────────┘       └───────▶   "A"
  60 |       | ///    ┌─────┐     │      ┌─────────────────────────────────┐
  61 |       | ///    │  0  │─────┘      │MinAccumulator {                 │
  62 |       | ///    ├─────┤     ┌─────▶│ min: ScalarValue::Utf8("Z")     │───────────────▶   "Z"
  63 |       | ///    │  1  │─────┘      │}                                │
  64 |       | ///    └─────┘            └─────────────────────────────────┘                   ...
  65 |       | ///      ...                 ...
  66 |       | ///    ┌─────┐            ┌────────────────────────────────┐
  67 |       | ///    │ N-2 │            │MinAccumulator {                │
  68 |       | ///    ├─────┤            │  min: ScalarValue::Utf8("A")   │────────────────▶   "A"
  69 |       | ///    │ N-1 │─────┐      │}                               │
  70 |       | ///    └─────┘     │      └────────────────────────────────┘
  71 |       | ///                │      ┌────────────────────────────────┐        ┌───────▶   "Q"
  72 |       | ///                │      │MinAccumulator {                │        │
  73 |       | ///                └─────▶│  min: ScalarValue::Utf8("Q")   │────────┘
  74 |       | ///                       │}                               │
  75 |       | ///                       └────────────────────────────────┘
  76 |       | ///
  77 |       | ///
  78 |       | ///  Logical group         Current Min/Max value for that group stored
  79 |       | ///     number             as a ScalarValue which points to an
  80 |       | ///                        individually allocated String
  81 |       | ///
  82 |       | ///```
  83 |       | ///
  84 |       | /// # Optimizations
  85 |       | ///
  86 |       | /// The adapter minimizes the number of calls to [`Accumulator::update_batch`]
  87 |       | /// by first collecting the input rows for each group into a contiguous array
  88 |       | /// using [`compute::take`].
  89 |       | ///
  90 |       | pub struct GroupsAccumulatorAdapter {
  91 |       |     factory: Box<dyn Fn() -> Result<Box<dyn Accumulator>> + Send>,
  92 |       |
  93 |       |     /// state for each group, stored in group_index order
  94 |       |     states: Vec<AccumulatorState>,
  95 |       |
  96 |       |     /// Current memory usage, in bytes.
  97 |       |     ///
  98 |       |     /// Note this is incrementally updated with deltas to avoid the
  99 |       |     /// call to size() being a bottleneck. We saw size() being a
 100 |       |     /// bottleneck in earlier implementations when there were many
 101 |       |     /// distinct groups.
 102 |       |     allocation_bytes: usize,
 103 |       | }
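To make the factory-per-group design described above concrete, here is a minimal sketch (not part of the file being measured; `CountNonNull` and `make_adapter` are made-up names) of a trivial `Accumulator` wrapped in `GroupsAccumulatorAdapter`. The adapter calls the factory once per distinct group and keeps the resulting accumulator in that group's `AccumulatorState`:

use arrow::array::{Array, ArrayRef, AsArray};
use arrow::datatypes::Int64Type;
use datafusion_common::{Result, ScalarValue};
use datafusion_expr_common::accumulator::Accumulator;

/// A deliberately tiny accumulator: counts the non-null input rows it sees.
#[derive(Debug, Default)]
struct CountNonNull {
    count: i64,
}

impl Accumulator for CountNonNull {
    fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
        // count the non-null rows of the first (and only) argument
        let arr = &values[0];
        self.count += (arr.len() - arr.null_count()) as i64;
        Ok(())
    }

    fn merge_batch(&mut self, states: &[ArrayRef]) -> Result<()> {
        // partial counts arrive as a single Int64 state column
        let partial = states[0].as_primitive::<Int64Type>();
        self.count += arrow::compute::sum(partial).unwrap_or(0);
        Ok(())
    }

    fn state(&mut self) -> Result<Vec<ScalarValue>> {
        Ok(vec![ScalarValue::Int64(Some(self.count))])
    }

    fn evaluate(&mut self) -> Result<ScalarValue> {
        Ok(ScalarValue::Int64(Some(self.count)))
    }

    fn size(&self) -> usize {
        std::mem::size_of_val(self)
    }
}

/// One adapter; internally it will hold one `CountNonNull` per group.
fn make_adapter() -> GroupsAccumulatorAdapter {
    GroupsAccumulatorAdapter::new(|| {
        Ok(Box::new(CountNonNull::default()) as Box<dyn Accumulator>)
    })
}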
 104 |       |
 105 |       | struct AccumulatorState {
 106 |       |     /// [`Accumulator`] that stores the per-group state
 107 |       |     accumulator: Box<dyn Accumulator>,
 108 |       |
 109 |       |     /// scratch space: indexes in the input array that will be fed to
 110 |       |     /// this accumulator. Stores indexes as `u32` to match the arrow
 111 |       |     /// `take` kernel input.
 112 |       |     indices: Vec<u32>,
 113 |       | }
 114 |       |
 115 |       | impl AccumulatorState {
 116 |   130 |     fn new(accumulator: Box<dyn Accumulator>) -> Self {
 117 |   130 |         Self {
 118 |   130 |             accumulator,
 119 |   130 |             indices: vec![],
 120 |   130 |         }
 121 |   130 |     }
 122 |       |
 123 |       |     /// Returns the amount of memory taken by this structure and its accumulator
 124 |   584 |     fn size(&self) -> usize {
 125 |   584 |         self.accumulator.size()
 126 |   584 |             + std::mem::size_of_val(self)
 127 |   584 |             + self.indices.allocated_size()
 128 |   584 |     }
 129 |       | }
 130 |       |
 131 |       | impl GroupsAccumulatorAdapter {
 132 |       |     /// Create a new adapter that will create a new [`Accumulator`]
 133 |       |     /// for each group, using the specified factory function
 134 |    40 |     pub fn new<F>(factory: F) -> Self
 135 |    40 |     where
 136 |    40 |         F: Fn() -> Result<Box<dyn Accumulator>> + Send + 'static,
 137 |    40 |     {
 138 |    40 |         Self {
 139 |    40 |             factory: Box::new(factory),
 140 |    40 |             states: vec![],
 141 |    40 |             allocation_bytes: 0,
 142 |    40 |         }
 143 |    40 |     }
 144 |       |
 145 |       |     /// Ensure that self.accumulators has total_num_groups
 146 |    66 |     fn make_accumulators_if_needed(&mut self, total_num_groups: usize) -> Result<()> {
 147 |    66 |         // can't shrink
 148 |    66 |         assert!(total_num_groups >= self.states.len());
 149 |    66 |         let vec_size_pre = self.states.allocated_size();
 150 |    66 |
 151 |    66 |         // instantiate new accumulators
 152 |    66 |         let new_accumulators = total_num_groups - self.states.len();
 153 |    66 |         for _ in 0..new_accumulators {
 154 |   130 |             let accumulator = (self.factory)()?;
 155 |   130 |             let state = AccumulatorState::new(accumulator);
 156 |   130 |             self.add_allocation(state.size());
 157 |   130 |             self.states.push(state);
 158 |       |         }
 159 |       |
 160 |    66 |         self.adjust_allocation(vec_size_pre, self.states.allocated_size());
 161 |    66 |         Ok(())
 162 |    66 |     }
 163 |       |
 164 |       |     /// invokes f(accumulator, values) for each group that has values
 165 |       |     /// in group_indices.
 166 |       |     ///
 167 |       |     /// This function first reorders the input and filter so that
 168 |       |     /// values for each group_index are contiguous and then invokes f
 169 |       |     /// on the contiguous ranges, to minimize per-row overhead
 170 |       |     ///
 171 |       |     /// ```text
 172 |       |     /// ┌─────────┐   ┌─────────┐   ┌ ─ ─ ─ ─ ┐                       ┌─────────┐   ┌ ─ ─ ─ ─ ┐
 173 |       |     /// │ ┌─────┐ │   │ ┌─────┐ │     ┌─────┐              ┏━━━━━┓    │ ┌─────┐ │     ┌─────┐
 174 |       |     /// │ │  2  │ │   │ │ 200 │ │   │ │  t  │ │            ┃  0  ┃    │ │ 200 │ │   │ │  t  │ │
 175 |       |     /// │ ├─────┤ │   │ ├─────┤ │     ├─────┤              ┣━━━━━┫    │ ├─────┤ │     ├─────┤
 176 |       |     /// │ │  2  │ │   │ │ 100 │ │   │ │  f  │ │            ┃  0  ┃    │ │ 300 │ │   │ │  t  │ │
 177 |       |     /// │ ├─────┤ │   │ ├─────┤ │     ├─────┤              ┣━━━━━┫    │ ├─────┤ │     ├─────┤
 178 |       |     /// │ │  0  │ │   │ │ 200 │ │   │ │  t  │ │            ┃  1  ┃    │ │ 200 │ │   │ │NULL │ │
 179 |       |     /// │ ├─────┤ │   │ ├─────┤ │     ├─────┤   ────────▶  ┣━━━━━┫    │ ├─────┤ │     ├─────┤
 180 |       |     /// │ │  1  │ │   │ │ 200 │ │   │ │NULL │ │            ┃  2  ┃    │ │ 200 │ │   │ │  t  │ │
 181 |       |     /// │ ├─────┤ │   │ ├─────┤ │     ├─────┤              ┣━━━━━┫    │ ├─────┤ │     ├─────┤
 182 |       |     /// │ │  0  │ │   │ │ 300 │ │   │ │  t  │ │            ┃  2  ┃    │ │ 100 │ │   │ │  f  │ │
 183 |       |     /// │ └─────┘ │   │ └─────┘ │     └─────┘              ┗━━━━━┛    │ └─────┘ │     └─────┘
 184 |       |     /// └─────────┘   └─────────┘   └ ─ ─ ─ ─ ┘                       └─────────┘   └ ─ ─ ─ ─ ┘
 185 |       |     ///
 186 |       |     /// logical group   values      opt_filter           logical group  values       opt_filter
 187 |       |     ///
 188 |       |     /// ```
 189 |    66 |     fn invoke_per_accumulator<F>(
 190 |    66 |         &mut self,
 191 |    66 |         values: &[ArrayRef],
 192 |    66 |         group_indices: &[usize],
 193 |    66 |         opt_filter: Option<&BooleanArray>,
 194 |    66 |         total_num_groups: usize,
 195 |    66 |         f: F,
 196 |    66 |     ) -> Result<()>
 197 |    66 |     where
 198 |    66 |         F: Fn(&mut dyn Accumulator, &[ArrayRef]) -> Result<()>,
 199 |    66 |     {
 200 |    66 |         self.make_accumulators_if_needed(total_num_groups)?;
 201 |       |
 202 |    66 |         assert_eq!(values[0].len(), group_indices.len());
 203 |       |
 204 |       |         // figure out which input rows correspond to which groups.
 205 |       |         // Note that self.state.indices starts empty for all groups
 206 |       |         // (it is cleared out below)
 207 |   234 |         for (idx, group_index) in group_indices.iter().enumerate() {
 208 |   234 |             self.states[*group_index].indices.push(idx as u32);
 209 |   234 |         }
 210 |       |
 211 |       |         // groups_with_rows holds a list of group indexes that have
 212 |       |         // any rows that need to be accumulated, stored in order of
 213 |       |         // group_index
 214 |       |
 215 |    66 |         let mut groups_with_rows = vec![];
 216 |    66 |
 217 |    66 |         // batch_indices holds indices into values, each group is contiguous
 218 |    66 |         let mut batch_indices = vec![];
 219 |    66 |
 220 |    66 |         // offsets[i] is index into batch_indices where the rows for
 221 |    66 |         // group_index i starts
 222 |    66 |         let mut offsets = vec![0];
 223 |    66 |
 224 |    66 |         let mut offset_so_far = 0;
 225 |   184 |         for (group_index, state) in self.states.iter_mut().enumerate() {
 226 |   184 |             let indices = &state.indices;
 227 |   184 |             if indices.is_empty() {
 228 |    22 |                 continue;
 229 |   162 |             }
 230 |   162 |
 231 |   162 |             groups_with_rows.push(group_index);
 232 |   162 |             batch_indices.extend_from_slice(indices);
 233 |   162 |             offset_so_far += indices.len();
 234 |   162 |             offsets.push(offset_so_far);
 235 |       |         }
 236 |    66 |         let batch_indices = batch_indices.into();
 237 |       |
 238 |       |         // reorder the values and opt_filter by batch_indices so that
 239 |       |         // all values for each group are contiguous, then invoke the
 240 |       |         // accumulator once per group with values
 241 |    66 |         let values = take_arrays(values, &batch_indices)?;
 242 |    66 |         let opt_filter = get_filter_at_indices(opt_filter, &batch_indices)?;
 243 |       |
 244 |       |         // invoke each accumulator with the appropriate rows, first
 245 |       |         // pulling the input arguments for this group into their own
 246 |       |         // RecordBatch(es)
 247 |    66 |         let iter = groups_with_rows.iter().zip(offsets.windows(2));
 248 |    66 |
 249 |    66 |         let mut sizes_pre = 0;
 250 |    66 |         let mut sizes_post = 0;
 251 |   228 |         for (&group_idx, offsets) in iter {
 252 |   162 |             let state = &mut self.states[group_idx];
 253 |   162 |             sizes_pre += state.size();
 254 |       |
 255 |   162 |             let values_to_accumulate = slice_and_maybe_filter(
 256 |   162 |                 &values,
 257 |   162 |                 opt_filter.as_ref().map(|f| f.as_boolean()),
 258 |   162 |                 offsets,
 259 |   162 |             )?;
 260 |   162 |             f(state.accumulator.as_mut(), &values_to_accumulate)?;
 261 |       |
 262 |       |             // clear out the state so they are empty for next
 263 |       |             // iteration
 264 |   162 |             state.indices.clear();
 265 |   162 |             sizes_post += state.size();
 266 |       |         }
 267 |       |
 268 |    66 |         self.adjust_allocation(sizes_pre, sizes_post);
 269 |    66 |         Ok(())
 270 |    66 |     }
 271 |       |
 272 |       |     /// Increment the allocation by `n`
 273 |       |     ///
 274 |       |     /// See [`Self::allocation_bytes`] for rationale.
 275 |   176 |     fn add_allocation(&mut self, size: usize) {
 276 |   176 |         self.allocation_bytes += size;
 277 |   176 |     }
 278 |       |
 279 |       |     /// Decrease the allocation by `n`
 280 |       |     ///
 281 |       |     /// See [`Self::allocation_bytes`] for rationale.
 282 |   280 |     fn free_allocation(&mut self, size: usize) {
 283 |   280 |         // use saturating sub to avoid errors if the accumulators
 284 |   280 |         // report erroneous sizes
 285 |   280 |         self.allocation_bytes = self.allocation_bytes.saturating_sub(size)
 286 |   280 |     }
 287 |       |
 288 |       |     /// Adjusts the allocation for something that started with
 289 |       |     /// start_size and now has new_size avoiding overflow
 290 |       |     ///
 291 |       |     /// See [`Self::allocation_bytes`] for rationale.
 292 |   196 |     fn adjust_allocation(&mut self, old_size: usize, new_size: usize) {
 293 |   196 |         if new_size > old_size {
 294 |    46 |             self.add_allocation(new_size - old_size)
 295 |       |         } else {
 296 |   150 |             self.free_allocation(old_size - new_size)
 297 |       |         }
 298 |   196 |     }
 299 |       | }
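The reordering that `invoke_per_accumulator` documents above can be illustrated with a small standalone sketch (a hypothetical helper, not part of this file) that turns `group_indices` into contiguous take indices plus `offsets`; the values below reuse the groups 2, 2, 0, 1, 0 from the diagram:

// Sketch only: maps each input row to its group, then concatenates the rows
// group by group so every group's rows form one contiguous run.
fn regroup(group_indices: &[usize], num_groups: usize) -> (Vec<u32>, Vec<usize>, Vec<usize>) {
    let mut per_group: Vec<Vec<u32>> = vec![vec![]; num_groups];
    for (row, group) in group_indices.iter().enumerate() {
        per_group[*group].push(row as u32);
    }

    let mut groups_with_rows = vec![];
    let mut batch_indices = vec![];
    let mut offsets = vec![0];
    for (group, rows) in per_group.iter().enumerate() {
        if rows.is_empty() {
            continue;
        }
        groups_with_rows.push(group);
        batch_indices.extend_from_slice(rows);
        // offsets[i]..offsets[i + 1] is the contiguous run for the i-th non-empty group
        offsets.push(batch_indices.len());
    }
    (batch_indices, offsets, groups_with_rows)
}

fn main() {
    // rows assigned to groups 2, 2, 0, 1, 0, as in the diagram above
    let (batch_indices, offsets, groups) = regroup(&[2, 2, 0, 1, 0], 3);
    assert_eq!(batch_indices, vec![2, 4, 3, 0, 1]); // group 0 rows, then group 1, then group 2
    assert_eq!(offsets, vec![0, 2, 3, 5]);
    assert_eq!(groups, vec![0, 1, 2]);
}

The real implementation then feeds `batch_indices` to the arrow `take` kernel so the accumulator for each group is invoked once over its contiguous slice rather than once per row.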
 300 |       |
 301 |       | impl GroupsAccumulator for GroupsAccumulatorAdapter {
 302 |    32 |     fn update_batch(
 303 |    32 |         &mut self,
 304 |    32 |         values: &[ArrayRef],
 305 |    32 |         group_indices: &[usize],
 306 |    32 |         opt_filter: Option<&BooleanArray>,
 307 |    32 |         total_num_groups: usize,
 308 |    32 |     ) -> Result<()> {
 309 |    32 |         self.invoke_per_accumulator(
 310 |    32 |             values,
 311 |    32 |             group_indices,
 312 |    32 |             opt_filter,
 313 |    32 |             total_num_groups,
 314 |    96 |             |accumulator, values_to_accumulate| {
 315 |    96 |                 accumulator.update_batch(values_to_accumulate)
 316 |    96 |             },
 317 |    32 |         )?;
 318 |    32 |         Ok(())
 319 |    32 |     }
 320 |       |
 321 |    12 |     fn evaluate(&mut self, emit_to: EmitTo) -> Result<ArrayRef> {
 322 |    12 |         let vec_size_pre = self.states.allocated_size();
 323 |    12 |
 324 |    12 |         let states = emit_to.take_needed(&mut self.states);
 325 |       |
 326 |    12 |         let results: Vec<ScalarValue> = states
 327 |    12 |             .into_iter()
 328 |    24 |             .map(|mut state| {
 329 |    24 |                 self.free_allocation(state.size());
 330 |    24 |                 state.accumulator.evaluate()
 331 |    24 |             })
 332 |    12 |             .collect::<Result<_>>()?;
 333 |       |
 334 |    12 |         let result = ScalarValue::iter_to_array(results);
 335 |    12 |
 336 |    12 |         self.adjust_allocation(vec_size_pre, self.states.allocated_size());
 337 |    12 |
 338 |    12 |         result
 339 |    12 |     }
 340 |       |
 341 |       |     // filtered_null_mask(opt_filter, &values);
 342 |    52 |     fn state(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>> {
 343 |    52 |         let vec_size_pre = self.states.allocated_size();
 344 |    52 |         let states = emit_to.take_needed(&mut self.states);
 345 |    52 |
 346 |    52 |         // each accumulator produces a potential vector of values
 347 |    52 |         // which we need to form into columns
 348 |    52 |         let mut results: Vec<Vec<ScalarValue>> = vec![];
 349 |       |
 350 |   158 |         for mut state in states {
 351 |   106 |             self.free_allocation(state.size());
 352 |   106 |             let accumulator_state = state.accumulator.state()?;
 353 |   106 |             results.resize_with(accumulator_state.len(), Vec::new);
 354 |   318 |             for (idx, state_val) in accumulator_state.into_iter().enumerate() {
 355 |   318 |                 results[idx].push(state_val);
 356 |   318 |             }
 357 |       |         }
 358 |       |
 359 |       |         // create an array for each intermediate column
 360 |    52 |         let arrays = results
 361 |    52 |             .into_iter()
 362 |    52 |             .map(ScalarValue::iter_to_array)
 363 |    52 |             .collect::<Result<Vec<_>>>()?;
 364 |       |
 365 |       |         // double check each array has the same length (aka the
 366 |       |         // accumulator was implemented correctly)
 367 |    52 |         if let Some(first_col) = arrays.first() {
 368 |   208 |             for arr in &arrays {
 369 |   156 |                 assert_eq!(arr.len(), first_col.len())
 370 |       |             }
 371 |     0 |         }
 372 |    52 |         self.adjust_allocation(vec_size_pre, self.states.allocated_size());
 373 |    52 |
 374 |    52 |         Ok(arrays)
 375 |    52 |     }
 376 |       |
 377 |    34 |     fn merge_batch(
 378 |    34 |         &mut self,
 379 |    34 |         values: &[ArrayRef],
 380 |    34 |         group_indices: &[usize],
 381 |    34 |         opt_filter: Option<&BooleanArray>,
 382 |    34 |         total_num_groups: usize,
 383 |    34 |     ) -> Result<()> {
 384 |    34 |         self.invoke_per_accumulator(
 385 |    34 |             values,
 386 |    34 |             group_indices,
 387 |    34 |             opt_filter,
 388 |    34 |             total_num_groups,
 389 |    66 |             |accumulator, values_to_accumulate| {
 390 |    66 |                 accumulator.merge_batch(values_to_accumulate)?;
 391 |    66 |                 Ok(())
 392 |    66 |             },
 393 |    34 |         )?;
 394 |    34 |         Ok(())
 395 |    34 |     }
 396 |       |
 397 |   208 |     fn size(&self) -> usize {
 398 |   208 |         self.allocation_bytes
 399 |   208 |     }
 400 |       |
 401 |     0 |     fn convert_to_state(
 402 |     0 |         &self,
 403 |     0 |         values: &[ArrayRef],
 404 |     0 |         opt_filter: Option<&BooleanArray>,
 405 |     0 |     ) -> Result<Vec<ArrayRef>> {
 406 |     0 |         let num_rows = values[0].len();
 407 |     0 |
 408 |     0 |         // Each row has its respective group
 409 |     0 |         let mut results = vec![];
 410 |     0 |         for row_idx in 0..num_rows {
 411 |       |             // Create the empty accumulator for converting
 412 |     0 |             let mut converted_accumulator = (self.factory)()?;
 413 |       |
 414 |       |             // Convert row to states
 415 |     0 |             let values_to_accumulate =
 416 |     0 |                 slice_and_maybe_filter(values, opt_filter, &[row_idx, row_idx + 1])?;
 417 |     0 |             converted_accumulator.update_batch(&values_to_accumulate)?;
 418 |     0 |             let states = converted_accumulator.state()?;
 419 |       |
 420 |       |             // Resize results to have enough columns according to the converted states
 421 |     0 |             results.resize_with(states.len(), || Vec::with_capacity(num_rows));
 422 |       |
 423 |       |             // Add the states to results
 424 |     0 |             for (idx, state_val) in states.into_iter().enumerate() {
 425 |     0 |                 results[idx].push(state_val);
 426 |     0 |             }
 427 |       |         }
 428 |       |
 429 |     0 |         let arrays = results
 430 |     0 |             .into_iter()
 431 |     0 |             .map(ScalarValue::iter_to_array)
 432 |     0 |             .collect::<Result<Vec<_>>>()?;
 433 |       |
 434 |     0 |         Ok(arrays)
 435 |     0 |     }
 436 |       |
 437 |    32 |     fn supports_convert_to_state(&self) -> bool {
 438 |    32 |         true
 439 |    32 |     }
 440 |       | }
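As a rough illustration of how `evaluate` and `state` above turn one `ScalarValue` per group into an output column (the values are made up, reusing the "A"/"Z"/"Q" minimums from the earlier diagram; `scalars_to_column` is a hypothetical helper):

use arrow::array::ArrayRef;
use datafusion_common::{Result, ScalarValue};

fn scalars_to_column() -> Result<ArrayRef> {
    // one ScalarValue per group, in group_index order
    let per_group = vec![
        ScalarValue::Utf8(Some("A".to_string())),
        ScalarValue::Utf8(Some("Z".to_string())),
        ScalarValue::Utf8(Some("Q".to_string())),
    ];
    // a single Utf8 array with one row per group
    ScalarValue::iter_to_array(per_group)
}

`state` does the same thing column by column: it collects the i-th state value from every group into `results[i]` and then converts each of those vectors into one array.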
 441 |       |
 442 |       | /// Extension trait for [`Vec`] to account for allocations.
 443 |       | pub trait VecAllocExt {
 444 |       |     /// Item type.
 445 |       |     type T;
 446 |       |     /// Return the amount of memory allocated by this Vec (not
 447 |       |     /// recursively counting any heap allocations contained within the
 448 |       |     /// structure). Does not include the size of `self`
 449 |       |     fn allocated_size(&self) -> usize;
 450 |       | }
 451 |       |
 452 |       | impl<T> VecAllocExt for Vec<T> {
 453 |       |     type T = T;
 454 |   844 |     fn allocated_size(&self) -> usize {
 455 |   844 |         std::mem::size_of::<T>() * self.capacity()
 456 |   844 |     }
 457 |       | }
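A quick illustration (hypothetical numbers) of what `allocated_size` measures: the `Vec`'s own buffer, derived from its capacity rather than its length:

// assumes the VecAllocExt trait defined above is in scope
fn allocated_size_example() {
    let v: Vec<u32> = Vec::with_capacity(10);
    // 10 slots * 4 bytes each = 40 bytes, even though v.len() == 0
    assert_eq!(v.allocated_size(), 10 * std::mem::size_of::<u32>());
    assert_eq!(v.len(), 0);
}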
 458 |       |
 459 |    66 | fn get_filter_at_indices(
 460 |    66 |     opt_filter: Option<&BooleanArray>,
 461 |    66 |     indices: &PrimitiveArray<UInt32Type>,
 462 |    66 | ) -> Result<Option<ArrayRef>> {
 463 |    66 |     opt_filter
 464 |    66 |         .map(|filter| {
 465 |     0 |             compute::take(
 466 |     0 |                 &filter, indices, None, // None: no index check
 467 |     0 |             )
 468 |    66 |         })
 469 |    66 |         .transpose()
 470 |    66 |         .map_err(|e| arrow_datafusion_err!(e))
 471 |    66 | }
 472 |       |
 473 |       | // Copied from physical-plan
 474 |   162 | pub(crate) fn slice_and_maybe_filter(
 475 |   162 |     aggr_array: &[ArrayRef],
 476 |   162 |     filter_opt: Option<&BooleanArray>,
 477 |   162 |     offsets: &[usize],
 478 |   162 | ) -> Result<Vec<ArrayRef>> {
 479 |   162 |     let (offset, length) = (offsets[0], offsets[1] - offsets[0]);
 480 |   162 |     let sliced_arrays: Vec<ArrayRef> = aggr_array
 481 |   162 |         .iter()
 482 |   390 |         .map(|array| array.slice(offset, length))
 483 |   162 |         .collect();
 484 |       |
 485 |   162 |     if let Some(f) = filter_opt {
 486 |     0 |         let filter = f.slice(offset, length);
 487 |     0 |
 488 |     0 |         sliced_arrays
 489 |     0 |             .iter()
 490 |     0 |             .map(|array| {
 491 |     0 |                 compute::filter(&array, &filter).map_err(|e| arrow_datafusion_err!(e))
 492 |     0 |             })
 493 |     0 |             .collect()
 494 |       |     } else {
 495 |   162 |         Ok(sliced_arrays)
 496 |       |     }
 497 |   162 | }
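Finally, a minimal sketch of what `slice_and_maybe_filter` does for a single group, written against plain arrow-rs calls (the values, offsets, and filter here are invented for illustration, and `slice_one_group` is a hypothetical helper):

use std::sync::Arc;
use arrow::array::{Array, ArrayRef, BooleanArray, Int32Array};
use arrow::compute;

fn slice_one_group() -> arrow::error::Result<ArrayRef> {
    // contiguous values for all groups, as produced by the take() reorder above
    let values: ArrayRef = Arc::new(Int32Array::from(vec![200, 300, 200, 200, 100]));

    // offsets[i]..offsets[i + 1] is one group's contiguous run; here group 0 owns rows 0..2
    let (offset, length) = (0, 2);
    let sliced = values.slice(offset, length);

    // if an opt_filter was supplied, it is sliced the same way and then applied
    let filter = BooleanArray::from(vec![true, false]);
    compute::filter(&sliced, &filter)
}

Slicing is zero-copy in arrow, so the per-group work is just adjusting offsets; only the optional filter step copies data.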