Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/functions-aggregate/src/lib.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143
18
#![deny(clippy::clone_on_ref_ptr)]
19
20
//! Aggregate Function packages for [DataFusion].
21
//!
22
//! This crate contains a collection of various aggregate function packages for DataFusion,
23
//! implemented using the extension API. Users may wish to control which functions
24
//! are available to control the binary size of their application as well as
25
//! use dialect specific implementations of functions (e.g. Spark vs Postgres)
26
//!
27
//! Each package is implemented as a separate
28
//! module, activated by a feature flag.
29
//!
30
//! [DataFusion]: https://crates.io/crates/datafusion
31
//!
32
//! # Available Packages
33
//! See the list of [modules](#modules) in this crate for available packages.
34
//!
35
//! # Using A Package
36
//! You can register all functions in all packages using the [`register_all`] function.
37
//!
38
//! Each package also exports an `expr_fn` submodule to help create [`Expr`]s that invoke
39
//! functions using a fluent style (for example, `expr_fn::sum`).
40
//!
41
//! [`Expr`]: datafusion_expr::Expr
42
//!
43
//! # Implementing A New Package
44
//!
45
//! To add a new package to this crate, you should follow the model of existing
46
//! packages. The high level steps are:
47
//!
48
//! 1. Create a new module with the appropriate [`AggregateUDF`] implementations.
49
//!
50
//! 2. Use the macros in [`macros`] to create standard entry points.
51
//!
52
//! 3. Add a new feature to `Cargo.toml`, with any optional dependencies
53
//!
54
//! 4. Use the `make_package!` macro to expose the module when the
55
//!    feature is enabled.
56
57
#[macro_use]
58
pub mod macros;
59
60
pub mod approx_distinct;
61
pub mod array_agg;
62
pub mod correlation;
63
pub mod count;
64
pub mod covariance;
65
pub mod first_last;
66
pub mod hyperloglog;
67
pub mod median;
68
pub mod min_max;
69
pub mod regr;
70
pub mod stddev;
71
pub mod sum;
72
pub mod variance;
73
74
pub mod approx_median;
75
pub mod approx_percentile_cont;
76
pub mod approx_percentile_cont_with_weight;
77
pub mod average;
78
pub mod bit_and_or_xor;
79
pub mod bool_and_or;
80
pub mod grouping;
81
pub mod nth_value;
82
pub mod string_agg;
83
84
use crate::approx_percentile_cont::approx_percentile_cont_udaf;
85
use crate::approx_percentile_cont_with_weight::approx_percentile_cont_with_weight_udaf;
86
use datafusion_common::Result;
87
use datafusion_execution::FunctionRegistry;
88
use datafusion_expr::AggregateUDF;
89
use log::debug;
90
use std::sync::Arc;
91
92
/// Fluent-style API for creating `Expr`s
93
pub mod expr_fn {
94
    pub use super::approx_distinct::approx_distinct;
95
    pub use super::approx_median::approx_median;
96
    pub use super::approx_percentile_cont::approx_percentile_cont;
97
    pub use super::approx_percentile_cont_with_weight::approx_percentile_cont_with_weight;
98
    pub use super::array_agg::array_agg;
99
    pub use super::average::avg;
100
    pub use super::bit_and_or_xor::bit_and;
101
    pub use super::bit_and_or_xor::bit_or;
102
    pub use super::bit_and_or_xor::bit_xor;
103
    pub use super::bool_and_or::bool_and;
104
    pub use super::bool_and_or::bool_or;
105
    pub use super::correlation::corr;
106
    pub use super::count::count;
107
    pub use super::count::count_distinct;
108
    pub use super::covariance::covar_pop;
109
    pub use super::covariance::covar_samp;
110
    pub use super::first_last::first_value;
111
    pub use super::first_last::last_value;
112
    pub use super::grouping::grouping;
113
    pub use super::median::median;
114
    pub use super::min_max::max;
115
    pub use super::min_max::min;
116
    pub use super::nth_value::nth_value;
117
    pub use super::regr::regr_avgx;
118
    pub use super::regr::regr_avgy;
119
    pub use super::regr::regr_count;
120
    pub use super::regr::regr_intercept;
121
    pub use super::regr::regr_r2;
122
    pub use super::regr::regr_slope;
123
    pub use super::regr::regr_sxx;
124
    pub use super::regr::regr_sxy;
125
    pub use super::regr::regr_syy;
126
    pub use super::stddev::stddev;
127
    pub use super::stddev::stddev_pop;
128
    pub use super::sum::sum;
129
    pub use super::variance::var_pop;
130
    pub use super::variance::var_sample;
131
}
132
133
/// Returns all default aggregate functions
134
pub fn all_default_aggregate_functions() -> Vec<Arc<AggregateUDF>> {
135
    vec![
136
        array_agg::array_agg_udaf(),
137
        first_last::first_value_udaf(),
138
        first_last::last_value_udaf(),
139
        covariance::covar_samp_udaf(),
140
        covariance::covar_pop_udaf(),
141
        correlation::corr_udaf(),
142
        sum::sum_udaf(),
143
        min_max::max_udaf(),
144
        min_max::min_udaf(),
145
        median::median_udaf(),
146
        count::count_udaf(),
147
        regr::regr_slope_udaf(),
148
        regr::regr_intercept_udaf(),
149
        regr::regr_count_udaf(),
150
        regr::regr_r2_udaf(),
151
        regr::regr_avgx_udaf(),
152
        regr::regr_avgy_udaf(),
153
        regr::regr_sxx_udaf(),
154
        regr::regr_syy_udaf(),
155
        regr::regr_sxy_udaf(),
156
        variance::var_samp_udaf(),
157
        variance::var_pop_udaf(),
158
        stddev::stddev_udaf(),
159
        stddev::stddev_pop_udaf(),
160
        approx_median::approx_median_udaf(),
161
        approx_distinct::approx_distinct_udaf(),
162
        approx_percentile_cont_udaf(),
163
        approx_percentile_cont_with_weight_udaf(),
164
        string_agg::string_agg_udaf(),
165
        bit_and_or_xor::bit_and_udaf(),
166
        bit_and_or_xor::bit_or_udaf(),
167
        bit_and_or_xor::bit_xor_udaf(),
168
        bool_and_or::bool_and_udaf(),
169
        bool_and_or::bool_or_udaf(),
170
        average::avg_udaf(),
171
        grouping::grouping_udaf(),
172
        nth_value::nth_value_udaf(),
173
    ]
174
}
175
176
/// Registers all enabled packages with a [`FunctionRegistry`]
177
pub fn register_all(registry: &mut dyn FunctionRegistry) -> Result<()> {
178
    let functions: Vec<Arc<AggregateUDF>> = all_default_aggregate_functions();
179
180
0
    functions.into_iter().try_for_each(|udf| {
181
0
        let existing_udaf = registry.register_udaf(udf)?;
182
0
        if let Some(existing_udaf) = existing_udaf {
183
0
            debug!("Overwrite existing UDAF: {}", existing_udaf.name());
184
0
        }
185
0
        Ok(()) as Result<()>
186
0
    })?;
187
188
    Ok(())
189
}
190
191
#[cfg(test)]
mod tests {
    use crate::all_default_aggregate_functions;
    use datafusion_common::Result;
    use std::collections::HashSet;

    /// Asserts that no two default aggregate functions share a name or an
    /// alias, comparing case-insensitively.
    #[test]
    fn test_no_duplicate_name() -> Result<()> {
        // TODO: remove this
        // These functions are in intermediate migration state, skip them
        let in_migration = ["array_agg", "count", "max", "min"];
        let mut seen = HashSet::new();

        for udaf in all_default_aggregate_functions() {
            // Lowercase once; the same value serves the skip check and the set.
            let lowered_name = udaf.name().to_lowercase();
            if in_migration.contains(&lowered_name.as_str()) {
                continue;
            }
            assert!(
                seen.insert(lowered_name),
                "duplicate function name: {}",
                udaf.name()
            );
            // Aliases share the namespace with primary names.
            for alias in udaf.aliases() {
                assert!(
                    seen.insert(alias.to_lowercase()),
                    "duplicate function name: {}",
                    alias
                );
            }
        }
        Ok(())
    }
}