/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/aggregates/group_values/mod.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! [`GroupValues`] trait for storing and interning group keys |
19 | | |
20 | | use arrow::record_batch::RecordBatch; |
21 | | use arrow_array::{downcast_primitive, ArrayRef}; |
22 | | use arrow_schema::{DataType, SchemaRef}; |
23 | | use bytes_view::GroupValuesBytesView; |
24 | | use datafusion_common::Result; |
25 | | |
26 | | pub(crate) mod primitive; |
27 | | use datafusion_expr::EmitTo; |
28 | | use primitive::GroupValuesPrimitive; |
29 | | |
30 | | mod column; |
31 | | mod row; |
32 | | use column::GroupValuesColumn; |
33 | | use row::GroupValuesRows; |
34 | | |
35 | | mod bytes; |
36 | | mod bytes_view; |
37 | | use bytes::GroupValuesByes; |
38 | | use datafusion_physical_expr::binary_map::OutputType; |
39 | | |
40 | | mod group_column; |
41 | | mod null_builder; |
42 | | |
43 | | /// Stores the group values during hash aggregation. |
44 | | /// |
45 | | /// # Background |
46 | | /// |
47 | | /// In a query such as `SELECT a, b, count(*) FROM t GROUP BY a, b`, the group values |
48 | | /// identify each group, and correspond to all the distinct values of `(a,b)`. |
49 | | /// |
50 | | /// ```sql |
51 | | /// -- Input has 4 rows with 3 distinct combinations of (a,b) ("groups") |
52 | | /// create table t(a int, b varchar) |
53 | | /// as values (1, 'a'), (2, 'b'), (1, 'a'), (3, 'c'); |
54 | | /// |
55 | | /// select a, b, count(*) from t group by a, b; |
56 | | /// ---- |
57 | | /// 1 a 2 |
58 | | /// 2 b 1 |
59 | | /// 3 c 1 |
60 | | /// ``` |
61 | | /// |
62 | | /// # Design |
63 | | /// |
64 | | /// Managing group values is a performance critical operation in hash |
65 | | /// aggregation. The major operations are: |
66 | | /// |
67 | | /// 1. Intern: Quickly finding existing and adding new group values |
68 | | /// 2. Emit: Returning the group values as an array |
69 | | /// |
70 | | /// There are multiple specialized implementations of this trait, each tuned for |
71 | | /// different data types and numbers of columns so that these operations are fast. |
72 | | /// See [`new_group_values`] for details. |
73 | | /// |
74 | | /// # Group Ids |
75 | | /// |
76 | | /// Each distinct group in a hash aggregation is identified by a unique group id |
77 | | /// (`usize`) which is assigned by instances of this trait. Group ids are |
78 | | /// contiguous: they start at 0 and increase without gaps. |
79 | | pub trait GroupValues: Send { |
80 | | /// Calculates the group id for each input row of `cols`, assigning new |
81 | | /// group ids as necessary. |
82 | | /// |
83 | | /// When the function returns, `groups` must contain the group id for each |
84 | | /// row in `cols`. |
85 | | /// |
86 | | /// If a row has the same value as a previous row, the same group id is |
87 | | /// assigned. If a row has a new value, the next available group id is |
88 | | /// assigned. |
89 | | fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()>; |
90 | | |
91 | | /// Returns the number of bytes of memory used by this [`GroupValues`] |
92 | | fn size(&self) -> usize; |
93 | | |
94 | | /// Returns true if this [`GroupValues`] is empty |
95 | | fn is_empty(&self) -> bool; |
96 | | |
97 | | /// The number of values (distinct group values) stored in this [`GroupValues`] |
98 | | fn len(&self) -> usize; |
99 | | |
100 | | /// Emits the group values |
101 | | fn emit(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>>; |
102 | | |
103 | | /// Clears the contents and shrinks the capacity to the size of the batch, freeing unused memory |
104 | | fn clear_shrink(&mut self, batch: &RecordBatch); |
105 | | } |
106 | | |
107 | | /// Return a specialized implementation of [`GroupValues`] for the given schema. |
108 | 70 | pub fn new_group_values(schema: SchemaRef) -> Result<Box<dyn GroupValues>> { |
109 | 70 | if schema.fields.len() == 1 { |
110 | 57 | let d = schema.fields[0].data_type(); |
111 | | |
112 | | macro_rules! downcast_helper { |
113 | | ($t:ty, $d:ident) => { |
114 | | return Ok(Box::new(GroupValuesPrimitive::<$t>::new($d.clone()))) |
115 | | }; |
116 | | } |
117 | | |
118 | 0 | downcast_primitive! { |
119 | 0 | d => (downcast_helper, d), |
120 | 1 | _ => {} |
121 | 1 | } |
122 | 1 | |
123 | 1 | match d { |
124 | | DataType::Utf8 => { |
125 | 0 | return Ok(Box::new(GroupValuesByes::<i32>::new(OutputType::Utf8))); |
126 | | } |
127 | | DataType::LargeUtf8 => { |
128 | 0 | return Ok(Box::new(GroupValuesByes::<i64>::new(OutputType::Utf8))); |
129 | | } |
130 | | DataType::Utf8View => { |
131 | 0 | return Ok(Box::new(GroupValuesBytesView::new(OutputType::Utf8View))); |
132 | | } |
133 | | DataType::Binary => { |
134 | 0 | return Ok(Box::new(GroupValuesByes::<i32>::new(OutputType::Binary))); |
135 | | } |
136 | | DataType::LargeBinary => { |
137 | 0 | return Ok(Box::new(GroupValuesByes::<i64>::new(OutputType::Binary))); |
138 | | } |
139 | | DataType::BinaryView => { |
140 | 0 | return Ok(Box::new(GroupValuesBytesView::new(OutputType::BinaryView))); |
141 | | } |
142 | 1 | _ => {} |
143 | | } |
144 | 13 | } |
145 | | |
146 | 14 | if GroupValuesColumn::supported_schema(schema.as_ref()) { |
147 | 13 | Ok(Box::new(GroupValuesColumn::try_new(schema)?0 )) |
148 | | } else { |
149 | 1 | Ok(Box::new(GroupValuesRows::try_new(schema)?0 )) |
150 | | } |
151 | 70 | } |