/Users/andrewlamb/Software/datafusion/datafusion/functions-aggregate-common/src/aggregate/count_distinct/bytes.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! [`BytesDistinctCountAccumulator`] for Utf8/LargeUtf8/Binary/LargeBinary values and [`BytesViewDistinctCountAccumulator`] for Utf8View/BinaryView values |
19 | | |
20 | | use arrow::array::{ArrayRef, OffsetSizeTrait}; |
21 | | use datafusion_common::cast::as_list_array; |
22 | | use datafusion_common::utils::array_into_list_array_nullable; |
23 | | use datafusion_common::ScalarValue; |
24 | | use datafusion_expr_common::accumulator::Accumulator; |
25 | | use datafusion_physical_expr_common::binary_map::{ArrowBytesSet, OutputType}; |
26 | | use datafusion_physical_expr_common::binary_view_map::ArrowBytesViewSet; |
27 | | use std::fmt::Debug; |
28 | | use std::sync::Arc; |
29 | | |
30 | | /// Specialized implementation of |
31 | | /// `COUNT DISTINCT` for [`StringArray`], [`LargeStringArray`], |
32 | | /// [`BinaryArray`] and [`LargeBinaryArray`]. |
33 | | /// |
34 | | /// [`StringArray`]: arrow::array::StringArray |
35 | | /// [`LargeStringArray`]: arrow::array::LargeStringArray |
36 | | /// [`BinaryArray`]: arrow::array::BinaryArray |
37 | | /// [`LargeBinaryArray`]: arrow::array::LargeBinaryArray |
38 | | #[derive(Debug)] |
39 | | pub struct BytesDistinctCountAccumulator<O: OffsetSizeTrait>(ArrowBytesSet<O>); |
40 | | |
41 | | impl<O: OffsetSizeTrait> BytesDistinctCountAccumulator<O> { |
42 | 0 | pub fn new(output_type: OutputType) -> Self { |
43 | 0 | Self(ArrowBytesSet::new(output_type)) |
44 | 0 | } |
45 | | } |
46 | | |
47 | | impl<O: OffsetSizeTrait> Accumulator for BytesDistinctCountAccumulator<O> { |
48 | 0 | fn state(&mut self) -> datafusion_common::Result<Vec<ScalarValue>> { |
49 | 0 | let set = self.0.take(); |
50 | 0 | let arr = set.into_state(); |
51 | 0 | let list = Arc::new(array_into_list_array_nullable(arr)); |
52 | 0 | Ok(vec![ScalarValue::List(list)]) |
53 | 0 | } |
54 | | |
55 | 0 | fn update_batch(&mut self, values: &[ArrayRef]) -> datafusion_common::Result<()> { |
56 | 0 | if values.is_empty() { |
57 | 0 | return Ok(()); |
58 | 0 | } |
59 | 0 | |
60 | 0 | self.0.insert(&values[0]); |
61 | 0 | |
62 | 0 | Ok(()) |
63 | 0 | } |
64 | | |
65 | 0 | fn merge_batch(&mut self, states: &[ArrayRef]) -> datafusion_common::Result<()> { |
66 | 0 | if states.is_empty() { |
67 | 0 | return Ok(()); |
68 | 0 | } |
69 | 0 | assert_eq!( |
70 | 0 | states.len(), |
71 | | 1, |
72 | 0 | "count_distinct states must be single array" |
73 | | ); |
74 | | |
75 | 0 | let arr = as_list_array(&states[0])?; |
76 | 0 | arr.iter().try_for_each(|maybe_list| { |
77 | 0 | if let Some(list) = maybe_list { |
78 | 0 | self.0.insert(&list); |
79 | 0 | }; |
80 | 0 | Ok(()) |
81 | 0 | }) |
82 | 0 | } |
83 | | |
84 | 0 | fn evaluate(&mut self) -> datafusion_common::Result<ScalarValue> { |
85 | 0 | Ok(ScalarValue::Int64(Some(self.0.non_null_len() as i64))) |
86 | 0 | } |
87 | | |
88 | 0 | fn size(&self) -> usize { |
89 | 0 | std::mem::size_of_val(self) + self.0.size() |
90 | 0 | } |
91 | | } |
92 | | |
93 | | /// Specialized implementation of |
94 | | /// `COUNT DISTINCT` for [`StringViewArray`] and [`BinaryViewArray`]. |
95 | | /// |
96 | | /// [`StringViewArray`]: arrow::array::StringViewArray |
97 | | /// [`BinaryViewArray`]: arrow::array::BinaryViewArray |
98 | | #[derive(Debug)] |
99 | | pub struct BytesViewDistinctCountAccumulator(ArrowBytesViewSet); |
100 | | |
101 | | impl BytesViewDistinctCountAccumulator { |
102 | 0 | pub fn new(output_type: OutputType) -> Self { |
103 | 0 | Self(ArrowBytesViewSet::new(output_type)) |
104 | 0 | } |
105 | | } |
106 | | |
107 | | impl Accumulator for BytesViewDistinctCountAccumulator { |
108 | 0 | fn state(&mut self) -> datafusion_common::Result<Vec<ScalarValue>> { |
109 | 0 | let set = self.0.take(); |
110 | 0 | let arr = set.into_state(); |
111 | 0 | let list = Arc::new(array_into_list_array_nullable(arr)); |
112 | 0 | Ok(vec![ScalarValue::List(list)]) |
113 | 0 | } |
114 | | |
115 | 0 | fn update_batch(&mut self, values: &[ArrayRef]) -> datafusion_common::Result<()> { |
116 | 0 | if values.is_empty() { |
117 | 0 | return Ok(()); |
118 | 0 | } |
119 | 0 | |
120 | 0 | self.0.insert(&values[0]); |
121 | 0 | |
122 | 0 | Ok(()) |
123 | 0 | } |
124 | | |
125 | 0 | fn merge_batch(&mut self, states: &[ArrayRef]) -> datafusion_common::Result<()> { |
126 | 0 | if states.is_empty() { |
127 | 0 | return Ok(()); |
128 | 0 | } |
129 | 0 | assert_eq!( |
130 | 0 | states.len(), |
131 | | 1, |
132 | 0 | "count_distinct states must be single array" |
133 | | ); |
134 | | |
135 | 0 | let arr = as_list_array(&states[0])?; |
136 | 0 | arr.iter().try_for_each(|maybe_list| { |
137 | 0 | if let Some(list) = maybe_list { |
138 | 0 | self.0.insert(&list); |
139 | 0 | }; |
140 | 0 | Ok(()) |
141 | 0 | }) |
142 | 0 | } |
143 | | |
144 | 0 | fn evaluate(&mut self) -> datafusion_common::Result<ScalarValue> { |
145 | 0 | Ok(ScalarValue::Int64(Some(self.0.non_null_len() as i64))) |
146 | 0 | } |
147 | | |
148 | 0 | fn size(&self) -> usize { |
149 | 0 | std::mem::size_of_val(self) + self.0.size() |
150 | 0 | } |
151 | | } |