/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/aggregates/group_values/bytes_view.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use crate::aggregates::group_values::GroupValues; |
19 | | use arrow_array::{Array, ArrayRef, RecordBatch}; |
20 | | use datafusion_expr::EmitTo; |
21 | | use datafusion_physical_expr::binary_map::OutputType; |
22 | | use datafusion_physical_expr_common::binary_view_map::ArrowBytesViewMap; |
23 | | |
24 | | /// A [`GroupValues`] storing a single column of Utf8View/BinaryView values |
25 | | /// |
26 | | /// This specialization is significantly faster than using the more general |
27 | | /// purpose `Row`s format |
28 | | pub struct GroupValuesBytesView { |
29 | | /// Map string/binary values to group index |
30 | | map: ArrowBytesViewMap<usize>, |
31 | | /// The total number of groups so far (used to assign group_index) |
32 | | num_groups: usize, |
33 | | } |
34 | | |
35 | | impl GroupValuesBytesView { |
36 | 0 | pub fn new(output_type: OutputType) -> Self { |
37 | 0 | Self { |
38 | 0 | map: ArrowBytesViewMap::new(output_type), |
39 | 0 | num_groups: 0, |
40 | 0 | } |
41 | 0 | } |
42 | | } |
43 | | |
44 | | impl GroupValues for GroupValuesBytesView { |
45 | 0 | fn intern( |
46 | 0 | &mut self, |
47 | 0 | cols: &[ArrayRef], |
48 | 0 | groups: &mut Vec<usize>, |
49 | 0 | ) -> datafusion_common::Result<()> { |
50 | 0 | assert_eq!(cols.len(), 1); |
51 | | |
52 | | // look up / add entries in the table |
53 | 0 | let arr = &cols[0]; |
54 | 0 |
|
55 | 0 | groups.clear(); |
56 | 0 | self.map.insert_if_new( |
57 | 0 | arr, |
58 | 0 | // called for each new group |
59 | 0 | |_value| { |
60 | 0 | // assign new group index on each insert |
61 | 0 | let group_idx = self.num_groups; |
62 | 0 | self.num_groups += 1; |
63 | 0 | group_idx |
64 | 0 | }, |
65 | 0 | // called for each group |
66 | 0 | |group_idx| { |
67 | 0 | groups.push(group_idx); |
68 | 0 | }, |
69 | 0 | ); |
70 | 0 |
|
71 | 0 | // ensure we assigned a group for each row
72 | 0 | assert_eq!(groups.len(), arr.len()); |
73 | 0 | Ok(()) |
74 | 0 | } |
75 | | |
76 | 0 | fn size(&self) -> usize { |
77 | 0 | self.map.size() + std::mem::size_of::<Self>() |
78 | 0 | } |
79 | | |
80 | 0 | fn is_empty(&self) -> bool { |
81 | 0 | self.num_groups == 0 |
82 | 0 | } |
83 | | |
84 | 0 | fn len(&self) -> usize { |
85 | 0 | self.num_groups |
86 | 0 | } |
87 | | |
88 | 0 | fn emit(&mut self, emit_to: EmitTo) -> datafusion_common::Result<Vec<ArrayRef>> { |
89 | 0 | // Reset the map to default, and convert it into a single array |
90 | 0 | let map_contents = self.map.take().into_state(); |
91 | | |
92 | 0 | let group_values = match emit_to { |
93 | | EmitTo::All => { |
94 | 0 | self.num_groups -= map_contents.len(); |
95 | 0 | map_contents |
96 | | } |
97 | 0 | EmitTo::First(n) if n == self.len() => { |
98 | 0 | self.num_groups -= map_contents.len(); |
99 | 0 | map_contents |
100 | | } |
101 | 0 | EmitTo::First(n) => { |
102 | 0 | // if we only wanted to take the first n, insert the rest back
103 | 0 | // into the map. We could potentially avoid this reallocation, at
104 | 0 | // the expense of much more complex code.
105 | 0 | // see https://github.com/apache/datafusion/issues/9195 |
106 | 0 | let emit_group_values = map_contents.slice(0, n); |
107 | 0 | let remaining_group_values = |
108 | 0 | map_contents.slice(n, map_contents.len() - n); |
109 | 0 |
|
110 | 0 | self.num_groups = 0; |
111 | 0 | let mut group_indexes = vec![]; |
112 | 0 | self.intern(&[remaining_group_values], &mut group_indexes)?; |
113 | | |
114 | | // Verify that the group indexes were assigned in the correct order |
115 | 0 | assert_eq!(0, group_indexes[0]); |
116 | | |
117 | 0 | emit_group_values |
118 | | } |
119 | | }; |
120 | | |
121 | 0 | Ok(vec![group_values]) |
122 | 0 | } |
123 | | |
124 | 0 | fn clear_shrink(&mut self, _batch: &RecordBatch) { |
125 | 0 | // in theory we could potentially avoid this reallocation and clear the |
126 | 0 | // contents of the maps, but for now we just reset the map from the beginning |
127 | 0 | self.map.take(); |
128 | 0 | } |
129 | | } |