Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/functions-aggregate-common/src/aggregate/count_distinct/bytes.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! [`BytesDistinctCountAccumulator`] for Utf8/LargeUtf8/Binary/LargeBinary values
19
20
use arrow::array::{ArrayRef, OffsetSizeTrait};
21
use datafusion_common::cast::as_list_array;
22
use datafusion_common::utils::array_into_list_array_nullable;
23
use datafusion_common::ScalarValue;
24
use datafusion_expr_common::accumulator::Accumulator;
25
use datafusion_physical_expr_common::binary_map::{ArrowBytesSet, OutputType};
26
use datafusion_physical_expr_common::binary_view_map::ArrowBytesViewSet;
27
use std::fmt::Debug;
28
use std::sync::Arc;
29
30
/// Specialized implementation of
31
/// `COUNT DISTINCT` for [`StringArray`] [`LargeStringArray`],
32
/// [`BinaryArray`] and [`LargeBinaryArray`].
33
///
34
/// [`StringArray`]: arrow::array::StringArray
35
/// [`LargeStringArray`]: arrow::array::LargeStringArray
36
/// [`BinaryArray`]: arrow::array::BinaryArray
37
/// [`LargeBinaryArray`]: arrow::array::LargeBinaryArray
38
#[derive(Debug)]
39
pub struct BytesDistinctCountAccumulator<O: OffsetSizeTrait>(ArrowBytesSet<O>);
40
41
impl<O: OffsetSizeTrait> BytesDistinctCountAccumulator<O> {
42
0
    pub fn new(output_type: OutputType) -> Self {
43
0
        Self(ArrowBytesSet::new(output_type))
44
0
    }
45
}
46
47
impl<O: OffsetSizeTrait> Accumulator for BytesDistinctCountAccumulator<O> {
48
0
    fn state(&mut self) -> datafusion_common::Result<Vec<ScalarValue>> {
49
0
        let set = self.0.take();
50
0
        let arr = set.into_state();
51
0
        let list = Arc::new(array_into_list_array_nullable(arr));
52
0
        Ok(vec![ScalarValue::List(list)])
53
0
    }
54
55
0
    fn update_batch(&mut self, values: &[ArrayRef]) -> datafusion_common::Result<()> {
56
0
        if values.is_empty() {
57
0
            return Ok(());
58
0
        }
59
0
60
0
        self.0.insert(&values[0]);
61
0
62
0
        Ok(())
63
0
    }
64
65
0
    fn merge_batch(&mut self, states: &[ArrayRef]) -> datafusion_common::Result<()> {
66
0
        if states.is_empty() {
67
0
            return Ok(());
68
0
        }
69
0
        assert_eq!(
70
0
            states.len(),
71
            1,
72
0
            "count_distinct states must be single array"
73
        );
74
75
0
        let arr = as_list_array(&states[0])?;
76
0
        arr.iter().try_for_each(|maybe_list| {
77
0
            if let Some(list) = maybe_list {
78
0
                self.0.insert(&list);
79
0
            };
80
0
            Ok(())
81
0
        })
82
0
    }
83
84
0
    fn evaluate(&mut self) -> datafusion_common::Result<ScalarValue> {
85
0
        Ok(ScalarValue::Int64(Some(self.0.non_null_len() as i64)))
86
0
    }
87
88
0
    fn size(&self) -> usize {
89
0
        std::mem::size_of_val(self) + self.0.size()
90
0
    }
91
}
92
93
/// Specialized implementation of
94
/// `COUNT DISTINCT` for [`StringViewArray`] and [`BinaryViewArray`].
95
///
96
/// [`StringViewArray`]: arrow::array::StringViewArray
97
/// [`BinaryViewArray`]: arrow::array::BinaryViewArray
98
#[derive(Debug)]
99
pub struct BytesViewDistinctCountAccumulator(ArrowBytesViewSet);
100
101
impl BytesViewDistinctCountAccumulator {
102
0
    pub fn new(output_type: OutputType) -> Self {
103
0
        Self(ArrowBytesViewSet::new(output_type))
104
0
    }
105
}
106
107
impl Accumulator for BytesViewDistinctCountAccumulator {
108
0
    fn state(&mut self) -> datafusion_common::Result<Vec<ScalarValue>> {
109
0
        let set = self.0.take();
110
0
        let arr = set.into_state();
111
0
        let list = Arc::new(array_into_list_array_nullable(arr));
112
0
        Ok(vec![ScalarValue::List(list)])
113
0
    }
114
115
0
    fn update_batch(&mut self, values: &[ArrayRef]) -> datafusion_common::Result<()> {
116
0
        if values.is_empty() {
117
0
            return Ok(());
118
0
        }
119
0
120
0
        self.0.insert(&values[0]);
121
0
122
0
        Ok(())
123
0
    }
124
125
0
    fn merge_batch(&mut self, states: &[ArrayRef]) -> datafusion_common::Result<()> {
126
0
        if states.is_empty() {
127
0
            return Ok(());
128
0
        }
129
0
        assert_eq!(
130
0
            states.len(),
131
            1,
132
0
            "count_distinct states must be single array"
133
        );
134
135
0
        let arr = as_list_array(&states[0])?;
136
0
        arr.iter().try_for_each(|maybe_list| {
137
0
            if let Some(list) = maybe_list {
138
0
                self.0.insert(&list);
139
0
            };
140
0
            Ok(())
141
0
        })
142
0
    }
143
144
0
    fn evaluate(&mut self) -> datafusion_common::Result<ScalarValue> {
145
0
        Ok(ScalarValue::Int64(Some(self.0.non_null_len() as i64)))
146
0
    }
147
148
0
    fn size(&self) -> usize {
149
0
        std::mem::size_of_val(self) + self.0.size()
150
0
    }
151
}