Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/aggregates/group_values/mod.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! [`GroupValues`] trait for storing and interning group keys
19
20
use arrow::record_batch::RecordBatch;
21
use arrow_array::{downcast_primitive, ArrayRef};
22
use arrow_schema::{DataType, SchemaRef};
23
use bytes_view::GroupValuesBytesView;
24
use datafusion_common::Result;
25
26
pub(crate) mod primitive;
27
use datafusion_expr::EmitTo;
28
use primitive::GroupValuesPrimitive;
29
30
mod column;
31
mod row;
32
use column::GroupValuesColumn;
33
use row::GroupValuesRows;
34
35
mod bytes;
36
mod bytes_view;
37
use bytes::GroupValuesByes;
38
use datafusion_physical_expr::binary_map::OutputType;
39
40
mod group_column;
41
mod null_builder;
42
43
/// Stores the group values during hash aggregation.
44
///
45
/// # Background
46
///
47
/// In a query such as `SELECT a, b, count(*) FROM t GROUP BY a, b`, the group values
48
/// identify each group, and correspond to all the distinct values of `(a,b)`.
49
///
50
/// ```sql
51
/// -- Input has 4 rows with 3 distinct combinations of (a,b) ("groups")
52
/// create table t(a int, b varchar)
53
/// as values (1, 'a'), (2, 'b'), (1, 'a'), (3, 'c');
54
///
55
/// select a, b, count(*) from t group by a, b;
56
/// ----
57
/// 1 a 2
58
/// 2 b 1
59
/// 3 c 1
60
/// ```
61
///
62
/// # Design
63
///
64
/// Managing group values is a performance critical operation in hash
65
/// aggregation. The major operations are:
66
///
67
/// 1. Intern: Quickly finding existing and adding new group values
68
/// 2. Emit: Returning the group values as an array
69
///
70
/// There are multiple specialized implementations of this trait, each optimized
71
/// for these operations on particular data types and numbers of columns.
72
/// See [`new_group_values`] for details.
73
///
74
/// # Group Ids
75
///
76
/// Each distinct group in a hash aggregation is identified by a unique group id
77
/// (`usize`), which is assigned by instances of this trait. Group ids are
78
/// contiguous (no gaps), starting from 0.
79
pub trait GroupValues: Send {
80
    /// Calculates the group id for each input row of `cols`, assigning new
81
    /// group ids as necessary.
82
    ///
83
    /// When the function returns, `groups` must contain the group id for each
84
    /// row in `cols`.
85
    ///
86
    /// If a row has the same value as a previous row, the same group id is
87
    /// assigned. If a row has a new value, the next available group id is
88
    /// assigned.
89
    fn intern(&mut self, cols: &[ArrayRef], groups: &mut Vec<usize>) -> Result<()>;
90
91
    /// Returns the number of bytes of memory used by this [`GroupValues`].
92
    fn size(&self) -> usize;
93
94
    /// Returns `true` if this [`GroupValues`] is empty.
95
    fn is_empty(&self) -> bool;
96
97
    /// Returns the number of distinct group values stored in this [`GroupValues`].
98
    fn len(&self) -> usize;
99
100
    /// Emits the group values according to `emit_to`, returning them as a `Vec` of arrays.
101
    fn emit(&mut self, emit_to: EmitTo) -> Result<Vec<ArrayRef>>;
102
103
    /// Clears the contents and shrinks the capacity to the size of `batch`, freeing up memory.
104
    fn clear_shrink(&mut self, batch: &RecordBatch);
105
}
106
107
/// Return a specialized implementation of [`GroupValues`] for the given schema.
///
/// The implementation is chosen in this order:
///
/// 1. If the schema has exactly one field and `downcast_primitive!` matches
///    its data type, `GroupValuesPrimitive` is returned for that type (the
///    `return` lives inside the local `downcast_helper` macro).
/// 2. If the schema has exactly one field of type `Utf8`, `LargeUtf8`,
///    `Binary` or `LargeBinary`, `GroupValuesByes` is returned. The `Large*`
///    variants use `i64` instead of `i32` as the type parameter and share the
///    same `OutputType` as their non-large counterparts. For `Utf8View` and
///    `BinaryView`, `GroupValuesBytesView` is returned.
/// 3. Otherwise (multiple fields, or a single field matched by none of the
///    above), `GroupValuesColumn` is used when
///    `GroupValuesColumn::supported_schema` accepts the schema, and
///    `GroupValuesRows` is the fallback.
///
/// # Errors
///
/// Propagates any error returned by `GroupValuesColumn::try_new` or
/// `GroupValuesRows::try_new`.
108
70
pub fn new_group_values(schema: SchemaRef) -> Result<Box<dyn GroupValues>> {
109
70
    if schema.fields.len() == 1 {
110
57
        let d = schema.fields[0].data_type();
111
112
        macro_rules! downcast_helper {
113
            ($t:ty, $d:ident) => {
114
                return Ok(Box::new(GroupValuesPrimitive::<$t>::new($d.clone())))
115
            };
116
        }
117
118
0
        downcast_primitive! {
119
0
            d => (downcast_helper, d),
120
1
            _ => {}
121
1
        }
122
1
123
1
        match d {
124
            DataType::Utf8 => {
125
0
                return Ok(Box::new(GroupValuesByes::<i32>::new(OutputType::Utf8)));
126
            }
127
            DataType::LargeUtf8 => {
128
0
                return Ok(Box::new(GroupValuesByes::<i64>::new(OutputType::Utf8)));
129
            }
130
            DataType::Utf8View => {
131
0
                return Ok(Box::new(GroupValuesBytesView::new(OutputType::Utf8View)));
132
            }
133
            DataType::Binary => {
134
0
                return Ok(Box::new(GroupValuesByes::<i32>::new(OutputType::Binary)));
135
            }
136
            DataType::LargeBinary => {
137
0
                return Ok(Box::new(GroupValuesByes::<i64>::new(OutputType::Binary)));
138
            }
139
            DataType::BinaryView => {
140
0
                return Ok(Box::new(GroupValuesBytesView::new(OutputType::BinaryView)));
141
            }
142
1
            _ => {}
143
        }
144
13
    }
145
146
14
    if GroupValuesColumn::supported_schema(schema.as_ref()) {
147
13
        Ok(Box::new(GroupValuesColumn::try_new(schema)
?0
))
148
    } else {
149
1
        Ok(Box::new(GroupValuesRows::try_new(schema)
?0
))
150
    }
151
70
}