/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/aggregates/group_values/bytes.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::aggregates::group_values::GroupValues; |
19 | | use arrow_array::{Array, ArrayRef, OffsetSizeTrait, RecordBatch}; |
20 | | use datafusion_expr::EmitTo; |
21 | | use datafusion_physical_expr_common::binary_map::{ArrowBytesMap, OutputType}; |
22 | | |
23 | | /// A [`GroupValues`] storing a single column of Utf8/LargeUtf8/Binary/LargeBinary values |
24 | | /// |
25 | | /// This specialization is significantly faster than using the more general |
26 | | /// purpose `Row`s format |
27 | | pub struct GroupValuesByes<O: OffsetSizeTrait> { |
28 | | /// Maps each distinct string/binary value to its group index |
29 | | map: ArrowBytesMap<O, usize>, |
30 | | /// The total number of groups so far (used to assign the next group index) |
31 | | num_groups: usize, |
32 | | } |
33 | | |
34 | | impl<O: OffsetSizeTrait> GroupValuesByes<O> { |
35 | 0 | pub fn new(output_type: OutputType) -> Self { |
36 | 0 | Self { |
37 | 0 | map: ArrowBytesMap::new(output_type), |
38 | 0 | num_groups: 0, |
39 | 0 | } |
40 | 0 | } |
41 | | } |
42 | | |
43 | | impl<O: OffsetSizeTrait> GroupValues for GroupValuesByes<O> { |
44 | 0 | fn intern( |
45 | 0 | &mut self, |
46 | 0 | cols: &[ArrayRef], |
47 | 0 | groups: &mut Vec<usize>, |
48 | 0 | ) -> datafusion_common::Result<()> { |
49 | 0 | assert_eq!(cols.len(), 1); |
50 | |
51 | | // look up / add entries in the table |
52 | 0 | let arr = &cols[0]; |
53 | 0 |
|
54 | 0 | groups.clear(); |
55 | 0 | self.map.insert_if_new( |
56 | 0 | arr, |
57 | 0 | // called for each new group |
58 | 0 | |_value| { |
59 | 0 | // assign new group index on each insert |
60 | 0 | let group_idx = self.num_groups; |
61 | 0 | self.num_groups += 1; |
62 | 0 | group_idx |
63 | 0 | }, |
64 | 0 | // called for each input row with the group index assigned to it |
65 | 0 | |group_idx| { |
66 | 0 | groups.push(group_idx); |
67 | 0 | }, |
68 | 0 | ); |
69 | 0 |
|
70 | 0 | // ensure we assigned a group to each row |
71 | 0 | assert_eq!(groups.len(), arr.len()); |
72 | 0 | Ok(()) |
73 | 0 | } |
74 | |
75 | 0 | fn size(&self) -> usize { |
76 | 0 | self.map.size() + std::mem::size_of::<Self>() |
77 | 0 | } |
78 | |
79 | 0 | fn is_empty(&self) -> bool { |
80 | 0 | self.num_groups == 0 |
81 | 0 | } |
82 | |
83 | 0 | fn len(&self) -> usize { |
84 | 0 | self.num_groups |
85 | 0 | } |
86 | |
87 | 0 | fn emit(&mut self, emit_to: EmitTo) -> datafusion_common::Result<Vec<ArrayRef>> { |
88 | 0 | // Reset the map to default, and convert it into a single array |
89 | 0 | let map_contents = self.map.take().into_state(); |
90 | |
91 | 0 | let group_values = match emit_to { |
92 | | EmitTo::All => { |
93 | 0 | self.num_groups -= map_contents.len(); |
94 | 0 | map_contents |
95 | | } |
96 | 0 | EmitTo::First(n) if n == self.len() => { |
97 | 0 | self.num_groups -= map_contents.len(); |
98 | 0 | map_contents |
99 | | } |
100 | 0 | EmitTo::First(n) => { |
101 | 0 | // if we only wanted to take the first n, insert the rest back |
102 | 0 | // into the map. We could potentially avoid this reallocation, at |
103 | 0 | // the expense of much more complex code. |
104 | 0 | // see https://github.com/apache/datafusion/issues/9195 |
105 | 0 | let emit_group_values = map_contents.slice(0, n); |
106 | 0 | let remaining_group_values = |
107 | 0 | map_contents.slice(n, map_contents.len() - n); |
108 | 0 |
|
109 | 0 | self.num_groups = 0; |
110 | 0 | let mut group_indexes = vec![]; |
111 | 0 | self.intern(&[remaining_group_values], &mut group_indexes)?; |
112 | |
113 | | // Verify that the group indexes were assigned in the correct order |
114 | 0 | assert_eq!(0, group_indexes[0]); |
115 | |
116 | 0 | emit_group_values |
117 | | } |
118 | | }; |
119 | |
120 | 0 | Ok(vec![group_values]) |
121 | 0 | } |
122 | |
123 | 0 | fn clear_shrink(&mut self, _batch: &RecordBatch) { |
124 | 0 | // in theory we could potentially avoid this reallocation and clear the |
125 | 0 | // contents of the maps, but for now we just reset the map from the beginning. NOTE(review): `num_groups` is not updated here, so `len()`/`is_empty()` keep reporting the previous count — confirm this is intended |
126 | 0 | self.map.take(); |
127 | 0 | } |
128 | | } |