/Users/andrewlamb/Software/datafusion/datafusion/functions-aggregate/src/lib.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 |
18 | | #![deny(clippy::clone_on_ref_ptr)] |
19 | | |
20 | | //! Aggregate Function packages for [DataFusion]. |
21 | | //! |
22 | | //! This crate contains a collection of various aggregate function packages for DataFusion, |
23 | | //! implemented using the extension API. Users may wish to control which functions |
24 | | //! are available to control the binary size of their application as well as |
25 | | //! use dialect specific implementations of functions (e.g. Spark vs Postgres) |
26 | | //! |
27 | | //! Each package is implemented as a separate |
28 | | //! module, activated by a feature flag. |
29 | | //! |
30 | | //! [DataFusion]: https://crates.io/crates/datafusion |
31 | | //! |
32 | | //! # Available Packages |
33 | | //! See the list of [modules](#modules) in this crate for available packages. |
34 | | //! |
35 | | //! # Using A Package |
36 | | //! You can register all functions in all packages using the [`register_all`] function. |
37 | | //! |
38 | | //! Each package also exports an `expr_fn` submodule to help create [`Expr`]s that invoke |
39 | | //! functions using a fluent style. |
40 | | //! |
41 | | //! [`Expr`]: datafusion_expr::Expr |
42 | | //! |
43 | | //! # Implementing A New Package |
44 | | //! |
45 | | //! To add a new package to this crate, you should follow the model of existing |
46 | | //! packages. The high level steps are: |
47 | | //! |
48 | | //! 1. Create a new module with the appropriate [AggregateUDF] implementations. |
49 | | //! |
50 | | //! 2. Use the macros in [`macros`] to create standard entry points. |
51 | | //! |
52 | | //! 3. Add a new feature to `Cargo.toml`, with any optional dependencies |
53 | | //! |
54 | | //! 4. Use the `make_package!` macro to expose the module when the |
55 | | //! feature is enabled. |
56 | | |
57 | | #[macro_use] |
58 | | pub mod macros; |
59 | | |
60 | | pub mod approx_distinct; |
61 | | pub mod array_agg; |
62 | | pub mod correlation; |
63 | | pub mod count; |
64 | | pub mod covariance; |
65 | | pub mod first_last; |
66 | | pub mod hyperloglog; |
67 | | pub mod median; |
68 | | pub mod min_max; |
69 | | pub mod regr; |
70 | | pub mod stddev; |
71 | | pub mod sum; |
72 | | pub mod variance; |
73 | | |
74 | | pub mod approx_median; |
75 | | pub mod approx_percentile_cont; |
76 | | pub mod approx_percentile_cont_with_weight; |
77 | | pub mod average; |
78 | | pub mod bit_and_or_xor; |
79 | | pub mod bool_and_or; |
80 | | pub mod grouping; |
81 | | pub mod nth_value; |
82 | | pub mod string_agg; |
83 | | |
84 | | use crate::approx_percentile_cont::approx_percentile_cont_udaf; |
85 | | use crate::approx_percentile_cont_with_weight::approx_percentile_cont_with_weight_udaf; |
86 | | use datafusion_common::Result; |
87 | | use datafusion_execution::FunctionRegistry; |
88 | | use datafusion_expr::AggregateUDF; |
89 | | use log::debug; |
90 | | use std::sync::Arc; |
91 | | |
92 | | /// Fluent-style API for creating `Expr`s |
93 | | pub mod expr_fn { |
94 | | pub use super::approx_distinct::approx_distinct; |
95 | | pub use super::approx_median::approx_median; |
96 | | pub use super::approx_percentile_cont::approx_percentile_cont; |
97 | | pub use super::approx_percentile_cont_with_weight::approx_percentile_cont_with_weight; |
98 | | pub use super::array_agg::array_agg; |
99 | | pub use super::average::avg; |
100 | | pub use super::bit_and_or_xor::bit_and; |
101 | | pub use super::bit_and_or_xor::bit_or; |
102 | | pub use super::bit_and_or_xor::bit_xor; |
103 | | pub use super::bool_and_or::bool_and; |
104 | | pub use super::bool_and_or::bool_or; |
105 | | pub use super::correlation::corr; |
106 | | pub use super::count::count; |
107 | | pub use super::count::count_distinct; |
108 | | pub use super::covariance::covar_pop; |
109 | | pub use super::covariance::covar_samp; |
110 | | pub use super::first_last::first_value; |
111 | | pub use super::first_last::last_value; |
112 | | pub use super::grouping::grouping; |
113 | | pub use super::median::median; |
114 | | pub use super::min_max::max; |
115 | | pub use super::min_max::min; |
116 | | pub use super::nth_value::nth_value; |
117 | | pub use super::regr::regr_avgx; |
118 | | pub use super::regr::regr_avgy; |
119 | | pub use super::regr::regr_count; |
120 | | pub use super::regr::regr_intercept; |
121 | | pub use super::regr::regr_r2; |
122 | | pub use super::regr::regr_slope; |
123 | | pub use super::regr::regr_sxx; |
124 | | pub use super::regr::regr_sxy; |
125 | | pub use super::regr::regr_syy; |
126 | | pub use super::stddev::stddev; |
127 | | pub use super::stddev::stddev_pop; |
128 | | pub use super::sum::sum; |
129 | | pub use super::variance::var_pop; |
130 | | pub use super::variance::var_sample; |
131 | | } |
132 | | |
133 | | /// Returns all default aggregate functions |
134 | | pub fn all_default_aggregate_functions() -> Vec<Arc<AggregateUDF>> { |
135 | | vec![ |
136 | | array_agg::array_agg_udaf(), |
137 | | first_last::first_value_udaf(), |
138 | | first_last::last_value_udaf(), |
139 | | covariance::covar_samp_udaf(), |
140 | | covariance::covar_pop_udaf(), |
141 | | correlation::corr_udaf(), |
142 | | sum::sum_udaf(), |
143 | | min_max::max_udaf(), |
144 | | min_max::min_udaf(), |
145 | | median::median_udaf(), |
146 | | count::count_udaf(), |
147 | | regr::regr_slope_udaf(), |
148 | | regr::regr_intercept_udaf(), |
149 | | regr::regr_count_udaf(), |
150 | | regr::regr_r2_udaf(), |
151 | | regr::regr_avgx_udaf(), |
152 | | regr::regr_avgy_udaf(), |
153 | | regr::regr_sxx_udaf(), |
154 | | regr::regr_syy_udaf(), |
155 | | regr::regr_sxy_udaf(), |
156 | | variance::var_samp_udaf(), |
157 | | variance::var_pop_udaf(), |
158 | | stddev::stddev_udaf(), |
159 | | stddev::stddev_pop_udaf(), |
160 | | approx_median::approx_median_udaf(), |
161 | | approx_distinct::approx_distinct_udaf(), |
162 | | approx_percentile_cont_udaf(), |
163 | | approx_percentile_cont_with_weight_udaf(), |
164 | | string_agg::string_agg_udaf(), |
165 | | bit_and_or_xor::bit_and_udaf(), |
166 | | bit_and_or_xor::bit_or_udaf(), |
167 | | bit_and_or_xor::bit_xor_udaf(), |
168 | | bool_and_or::bool_and_udaf(), |
169 | | bool_and_or::bool_or_udaf(), |
170 | | average::avg_udaf(), |
171 | | grouping::grouping_udaf(), |
172 | | nth_value::nth_value_udaf(), |
173 | | ] |
174 | | } |
175 | | |
176 | | /// Registers all enabled packages with a [`FunctionRegistry`] |
177 | | pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> { |
178 | | let functions: Vec<Arc<AggregateUDF>> = all_default_aggregate_functions(); |
179 | | |
180 | 0 | functions.into_iter().try_for_each(|udf| { |
181 | 0 | let existing_udaf = registry.register_udaf(udf)?; |
182 | 0 | if let Some(existing_udaf) = existing_udaf { |
183 | 0 | debug!("Overwrite existing UDAF: {}", existing_udaf.name()); |
184 | 0 | } |
185 | 0 | Ok(()) as Result<()> |
186 | 0 | })?; |
187 | | |
188 | | Ok(()) |
189 | | } |
190 | | |
#[cfg(test)]
mod tests {
    use crate::all_default_aggregate_functions;
    use datafusion_common::Result;
    use std::collections::HashSet;

    /// Checks that no two default aggregate functions share a name or alias,
    /// comparing case-insensitively.
    #[test]
    fn test_no_duplicate_name() -> Result<()> {
        // TODO: remove this
        // These functions are in intermediate migration state, skip them
        let in_migration = ["array_agg", "count", "max", "min"];

        let mut seen = HashSet::new();
        for udaf in all_default_aggregate_functions() {
            let lowered = udaf.name().to_lowercase();
            if in_migration.contains(&lowered.as_str()) {
                continue;
            }

            // The primary name and every alias share one namespace.
            assert!(
                seen.insert(lowered),
                "duplicate function name: {}",
                udaf.name()
            );
            for alias in udaf.aliases() {
                assert!(
                    seen.insert(alias.to_lowercase()),
                    "duplicate function name: {}",
                    alias
                );
            }
        }
        Ok(())
    }
}