Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/aggregates/group_values/bytes.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use crate::aggregates::group_values::GroupValues;
19
use arrow_array::{Array, ArrayRef, OffsetSizeTrait, RecordBatch};
20
use datafusion_expr::EmitTo;
21
use datafusion_physical_expr_common::binary_map::{ArrowBytesMap, OutputType};
22
23
/// A [`GroupValues`] storing single column of Utf8/LargeUtf8/Binary/LargeBinary values
24
///
25
/// This specialization is significantly faster than using the more general
26
/// purpose `Row`s format
27
pub struct GroupValuesByes<O: OffsetSizeTrait> {
28
    /// Map string/binary values to group index
29
    map: ArrowBytesMap<O, usize>,
30
    /// The total number of groups so far (used to assign group_index)
31
    num_groups: usize,
32
}
33
34
impl<O: OffsetSizeTrait> GroupValuesByes<O> {
35
0
    pub fn new(output_type: OutputType) -> Self {
36
0
        Self {
37
0
            map: ArrowBytesMap::new(output_type),
38
0
            num_groups: 0,
39
0
        }
40
0
    }
41
}
42
43
impl<O: OffsetSizeTrait> GroupValues for GroupValuesByes<O> {
44
0
    fn intern(
45
0
        &mut self,
46
0
        cols: &[ArrayRef],
47
0
        groups: &mut Vec<usize>,
48
0
    ) -> datafusion_common::Result<()> {
49
0
        assert_eq!(cols.len(), 1);
50
51
        // look up / add entries in the table
52
0
        let arr = &cols[0];
53
0
54
0
        groups.clear();
55
0
        self.map.insert_if_new(
56
0
            arr,
57
0
            // called for each new group
58
0
            |_value| {
59
0
                // assign new group index on each insert
60
0
                let group_idx = self.num_groups;
61
0
                self.num_groups += 1;
62
0
                group_idx
63
0
            },
64
0
            // called for each group
65
0
            |group_idx| {
66
0
                groups.push(group_idx);
67
0
            },
68
0
        );
69
0
70
0
        // ensure we assigned a group to for each row
71
0
        assert_eq!(groups.len(), arr.len());
72
0
        Ok(())
73
0
    }
74
75
0
    fn size(&self) -> usize {
76
0
        self.map.size() + std::mem::size_of::<Self>()
77
0
    }
78
79
0
    fn is_empty(&self) -> bool {
80
0
        self.num_groups == 0
81
0
    }
82
83
0
    fn len(&self) -> usize {
84
0
        self.num_groups
85
0
    }
86
87
0
    fn emit(&mut self, emit_to: EmitTo) -> datafusion_common::Result<Vec<ArrayRef>> {
88
0
        // Reset the map to default, and convert it into a single array
89
0
        let map_contents = self.map.take().into_state();
90
91
0
        let group_values = match emit_to {
92
            EmitTo::All => {
93
0
                self.num_groups -= map_contents.len();
94
0
                map_contents
95
            }
96
0
            EmitTo::First(n) if n == self.len() => {
97
0
                self.num_groups -= map_contents.len();
98
0
                map_contents
99
            }
100
0
            EmitTo::First(n) => {
101
0
                // if we only wanted to take the first n, insert the rest back
102
0
                // into the map we could potentially avoid this reallocation, at
103
0
                // the expense of much more complex code.
104
0
                // see https://github.com/apache/datafusion/issues/9195
105
0
                let emit_group_values = map_contents.slice(0, n);
106
0
                let remaining_group_values =
107
0
                    map_contents.slice(n, map_contents.len() - n);
108
0
109
0
                self.num_groups = 0;
110
0
                let mut group_indexes = vec![];
111
0
                self.intern(&[remaining_group_values], &mut group_indexes)?;
112
113
                // Verify that the group indexes were assigned in the correct order
114
0
                assert_eq!(0, group_indexes[0]);
115
116
0
                emit_group_values
117
            }
118
        };
119
120
0
        Ok(vec![group_values])
121
0
    }
122
123
0
    fn clear_shrink(&mut self, _batch: &RecordBatch) {
124
0
        // in theory we could potentially avoid this reallocation and clear the
125
0
        // contents of the maps, but for now we just reset the map from the beginning
126
0
        self.map.take();
127
0
    }
128
}