From 530412fadc1cfa45c872f48f708dca1877d0d668 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 3 Oct 2024 12:28:30 -0400 Subject: [PATCH] Port / Add Documentation for `VarianceSample` and `VariancePopulation` --- datafusion/expr/src/udf_docs.rs | 13 +++++ .../functions-aggregate/src/variance.rs | 47 +++++++++++++++++-- .../user-guide/sql/aggregate_functions.md | 39 --------------- 3 files changed, 57 insertions(+), 42 deletions(-) diff --git a/datafusion/expr/src/udf_docs.rs b/datafusion/expr/src/udf_docs.rs index 280910b87199d..f3b31c00b3004 100644 --- a/datafusion/expr/src/udf_docs.rs +++ b/datafusion/expr/src/udf_docs.rs @@ -131,6 +131,7 @@ impl DocumentationBuilder { self } + /// Adds an argument to the documentation, pub fn with_argument( mut self, arg_name: impl Into, @@ -142,6 +143,18 @@ impl DocumentationBuilder { self } + /// Add a standard "expression" argument to the documentation + /// + /// This is a common argument for scalar UDFs that operate on an expression and is rendered like + /// + /// ```no-run + /// expression: + /// Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + /// ``` + pub fn with_expression_argument(self) -> Self { + self.with_argument("expression", "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + } + pub fn with_related_udf(mut self, related_udf: impl Into) -> Self { let mut related = self.related_udfs.unwrap_or_default(); related.push(related_udf.into()); diff --git a/datafusion/functions-aggregate/src/variance.rs b/datafusion/functions-aggregate/src/variance.rs index 3648ec0d13127..31257392cb13e 100644 --- a/datafusion/functions-aggregate/src/variance.rs +++ b/datafusion/functions-aggregate/src/variance.rs @@ -18,22 +18,24 @@ //! [`VarianceSample`]: variance sample aggregations. //! [`VariancePopulation`]: variance population aggregations. -use std::{fmt::Debug, sync::Arc}; - use arrow::{ array::{Array, ArrayRef, BooleanArray, Float64Array, UInt64Array}, buffer::NullBuffer, compute::kernels::cast, datatypes::{DataType, Field}, }; +use std::sync::OnceLock; +use std::{fmt::Debug, sync::Arc}; use datafusion_common::{ downcast_value, not_impl_err, plan_err, DataFusionError, Result, ScalarValue, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::{ function::{AccumulatorArgs, StateFieldsArgs}, utils::format_state_name, - Accumulator, AggregateUDFImpl, GroupsAccumulator, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, Signature, + Volatility, }; use datafusion_functions_aggregate_common::{ aggregate::groups_accumulator::accumulate::accumulate, stats::StatsType, @@ -135,6 +137,26 @@ impl AggregateUDFImpl for VarianceSample { ) -> Result> { Ok(Box::new(VarianceGroupsAccumulator::new(StatsType::Sample))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_variance_sample_doc()) + } +} + +static VARIANCE_SAMPLE_DOC: OnceLock = OnceLock::new(); + +fn get_variance_sample_doc() -> &'static Documentation { + VARIANCE_SAMPLE_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the statistical sample variance of a set of numbers.", + ) + .with_syntax_example("var(expression)") + .with_expression_argument() + .build() + .unwrap() + }) } pub struct VariancePopulation { @@ -222,6 +244,25 @@ impl AggregateUDFImpl for VariancePopulation { StatsType::Population, ))) } + fn documentation(&self) -> Option<&Documentation> { + Some(get_variance_population_doc()) + } +} + +static VARIANCE_POPULATION_DOC: OnceLock = OnceLock::new(); + +fn get_variance_population_doc() -> &'static Documentation { + VARIANCE_POPULATION_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the statistical population variance of a set of numbers.", + ) + .with_syntax_example("var_pop(expression)") + .with_expression_argument() + .build() + .unwrap() + }) } /// An accumulator to compute variance diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md index edb0e1d0c9f09..5b7d5603c5512 100644 --- a/docs/source/user-guide/sql/aggregate_functions.md +++ b/docs/source/user-guide/sql/aggregate_functions.md @@ -349,45 +349,6 @@ stddev_samp(expression) #### Arguments -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `var` - -Returns the statistical variance of a set of numbers. - -``` -var(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `var_pop` - -Returns the statistical population variance of a set of numbers. - -``` -var_pop(expression) -``` - -#### Arguments - -- **expression**: Expression to operate on. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `var_samp` - -Returns the statistical sample variance of a set of numbers. - -``` -var_samp(expression) -``` - -#### Arguments - - **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.