From a203c2b167123eba4a2f36df4f954a6fecf536a5 Mon Sep 17 00:00:00 2001 From: Ian Lai <108986288+Chen-Yuan-Lai@users.noreply.github.com> Date: Sun, 29 Dec 2024 21:11:35 +0800 Subject: [PATCH] doc-gen: migrate scalar functions (datetime) documentation 1/2 (#13920) * doc-gen: migrate scalar functions (datetime) documentation 1/2 * fix: fix typo and update function docs --------- Co-authored-by: Cheng-Yuan-Lai --- .../functions/src/datetime/current_date.rs | 30 ++--- .../functions/src/datetime/current_time.rs | 30 ++--- datafusion/functions/src/datetime/date_bin.rs | 118 +++++++++--------- .../functions/src/datetime/date_part.rs | 78 ++++++------ .../functions/src/datetime/date_trunc.rs | 60 ++++----- .../functions/src/datetime/from_unixtime.rs | 49 ++++---- .../source/user-guide/sql/scalar_functions.md | 28 ++--- 7 files changed, 175 insertions(+), 218 deletions(-) diff --git a/datafusion/functions/src/datetime/current_date.rs b/datafusion/functions/src/datetime/current_date.rs index 97d97939d329..868cbe23d616 100644 --- a/datafusion/functions/src/datetime/current_date.rs +++ b/datafusion/functions/src/datetime/current_date.rs @@ -22,13 +22,21 @@ use arrow::datatypes::DataType::Date32; use chrono::{Datelike, NaiveDate}; use datafusion_common::{internal_err, Result, ScalarValue}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, Volatility, }; -use std::sync::OnceLock; +use datafusion_macros::user_doc; +#[user_doc( + doc_section(label = "Time and Date Functions"), + description = r#" +Returns the current UTC date. + +The `current_date()` return value is determined at query time and will return the same date, no matter when in the query plan the function executes. 
+"#, + syntax_example = "current_date()" +)] #[derive(Debug)] pub struct CurrentDateFunc { signature: Signature, @@ -105,22 +113,6 @@ impl ScalarUDFImpl for CurrentDateFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_current_date_doc()) + self.doc() } } - -static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); - -fn get_current_date_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_DATETIME, - r#" -Returns the current UTC date. - -The `current_date()` return value is determined at query time and will return the same date, no matter when in the query plan the function executes. -"#, - "current_date()") - .build() - }) -} diff --git a/datafusion/functions/src/datetime/current_time.rs b/datafusion/functions/src/datetime/current_time.rs index 1cd39e5777ea..142184508ec6 100644 --- a/datafusion/functions/src/datetime/current_time.rs +++ b/datafusion/functions/src/datetime/current_time.rs @@ -19,15 +19,23 @@ use arrow::datatypes::DataType; use arrow::datatypes::DataType::Time64; use arrow::datatypes::TimeUnit::Nanosecond; use std::any::Any; -use std::sync::OnceLock; use datafusion_common::{internal_err, Result, ScalarValue}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; +#[user_doc( + doc_section(label = "Time and Date Functions"), + description = r#" +Returns the current UTC time. + +The `current_time()` return value is determined at query time and will return the same time, no matter when in the query plan the function executes. 
+"#, + syntax_example = "current_time()" +)] #[derive(Debug)] pub struct CurrentTimeFunc { signature: Signature, @@ -93,22 +101,6 @@ impl ScalarUDFImpl for CurrentTimeFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_current_time_doc()) + self.doc() } } - -static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); - -fn get_current_time_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_DATETIME, - r#" -Returns the current UTC time. - -The `current_time()` return value is determined at query time and will return the same time, no matter when in the query plan the function executes. -"#, - "current_time()") - .build() - }) -} diff --git a/datafusion/functions/src/datetime/date_bin.rs b/datafusion/functions/src/datetime/date_bin.rs index bb3f2177b9a4..a2886936992f 100644 --- a/datafusion/functions/src/datetime/date_bin.rs +++ b/datafusion/functions/src/datetime/date_bin.rs @@ -16,7 +16,7 @@ // under the License. use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use arrow::array::temporal_conversions::NANOSECONDS; use arrow::array::types::{ @@ -37,10 +37,64 @@ use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TIMEZONE_WILDCARD, }; +use datafusion_macros::user_doc; use chrono::{DateTime, Datelike, Duration, Months, TimeDelta, Utc}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; +#[user_doc( + doc_section(label = "Time and Date Functions"), + description = r#" +Calculates time intervals and returns the start of the interval nearest to the specified timestamp. Use `date_bin` to downsample time series data by grouping rows into time-based "bins" or "windows" and applying an aggregate or selector function to each window. 
+ +For example, if you "bin" or "window" data into 15 minute intervals, an input timestamp of `2023-01-01T18:18:18Z` will be updated to the start time of the 15 minute bin it is in: `2023-01-01T18:15:00Z`. +"#, + syntax_example = "date_bin(interval, expression, origin-timestamp)", + sql_example = r#"```sql +-- Bin the timestamp into 1 day intervals +> SELECT date_bin(interval '1 day', time) as bin +FROM VALUES ('2023-01-01T18:18:18Z'), ('2023-01-03T19:00:03Z') t(time); ++---------------------+ +| bin | ++---------------------+ +| 2023-01-01T00:00:00 | +| 2023-01-03T00:00:00 | ++---------------------+ +2 row(s) fetched. + +-- Bin the timestamp into 1 day intervals starting at 3AM on 2023-01-01 +> SELECT date_bin(interval '1 day', time, '2023-01-01T03:00:00') as bin +FROM VALUES ('2023-01-01T18:18:18Z'), ('2023-01-03T19:00:03Z') t(time); ++---------------------+ +| bin | ++---------------------+ +| 2023-01-01T03:00:00 | +| 2023-01-03T03:00:00 | ++---------------------+ +2 row(s) fetched. +```"#, + argument(name = "interval", description = "Bin interval."), + argument( + name = "expression", + description = "Time expression to operate on. Can be a constant, column, or function." + ), + argument( + name = "origin-timestamp", + description = r#"Optional. Starting point used to determine bin boundaries. If not specified defaults 1970-01-01T00:00:00Z (the UNIX epoch in UTC). 
The following intervals are supported: + + - nanoseconds + - microseconds + - milliseconds + - seconds + - minutes + - hours + - days + - weeks + - months + - years + - century +"# + ) +)] #[derive(Debug)] pub struct DateBinFunc { signature: Signature, @@ -169,68 +223,10 @@ impl ScalarUDFImpl for DateBinFunc { } } fn documentation(&self) -> Option<&Documentation> { - Some(get_date_bin_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); - -fn get_date_bin_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_DATETIME, - r#" -Calculates time intervals and returns the start of the interval nearest to the specified timestamp. Use `date_bin` to downsample time series data by grouping rows into time-based "bins" or "windows" and applying an aggregate or selector function to each window. - -For example, if you "bin" or "window" data into 15 minute intervals, an input timestamp of `2023-01-01T18:18:18Z` will be updated to the start time of the 15 minute bin it is in: `2023-01-01T18:15:00Z`. -"#, - "date_bin(interval, expression, origin-timestamp)") - .with_sql_example(r#"```sql --- Bin the timestamp into 1 day intervals -> SELECT date_bin(interval '1 day', time) as bin -FROM VALUES ('2023-01-01T18:18:18Z'), ('2023-01-03T19:00:03Z') t(time); -+---------------------+ -| bin | -+---------------------+ -| 2023-01-01T00:00:00 | -| 2023-01-03T00:00:00 | -+---------------------+ -2 row(s) fetched. - --- Bin the timestamp into 1 day intervals starting at 3AM on 2023-01-01 -> SELECT date_bin(interval '1 day', time, '2023-01-01T03:00:00') as bin -FROM VALUES ('2023-01-01T18:18:18Z'), ('2023-01-03T19:00:03Z') t(time); -+---------------------+ -| bin | -+---------------------+ -| 2023-01-01T03:00:00 | -| 2023-01-03T03:00:00 | -+---------------------+ -2 row(s) fetched. -``` -"#) - .with_argument("interval", "Bin interval.") - .with_argument("expression", "Time expression to operate on. 
Can be a constant, column, or function.") - .with_argument("origin-timestamp", "Optional. Starting point used to determine bin boundaries. If not specified defaults 1970-01-01T00:00:00Z (the UNIX epoch in UTC). - -The following intervals are supported: - -- nanoseconds -- microseconds -- milliseconds -- seconds -- minutes -- hours -- days -- weeks -- months -- years -- century -") - .build() - }) -} - enum Interval { Nanoseconds(i64), Months(i64), diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 0f115563c8db..0f01b6a21b0a 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::str::FromStr; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use arrow::array::{Array, ArrayRef, Float64Array, Int32Array}; use arrow::compute::kernels::cast_utils::IntervalUnit; @@ -41,11 +41,42 @@ use datafusion_common::{ ExprSchema, Result, ScalarValue, }; use datafusion_expr::{ - scalar_doc_sections::DOC_SECTION_DATETIME, ColumnarValue, Documentation, Expr, - ScalarUDFImpl, Signature, TypeSignature, Volatility, + ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, TypeSignature, + Volatility, }; use datafusion_expr_common::signature::TypeSignatureClass; - +use datafusion_macros::user_doc; + +#[user_doc( + doc_section(label = "Time and Date Functions"), + description = "Returns the specified part of the date as an integer.", + syntax_example = "date_part(part, expression)", + alternative_syntax = "extract(field FROM source)", + argument( + name = "part", + description = r#"Part of the date to return. 
The following date parts are supported: + + - year + - quarter (emits value in inclusive range [1, 4] based on which quartile of the year the date is in) + - month + - week (week of the year) + - day (day of the month) + - hour + - minute + - second + - millisecond + - microsecond + - nanosecond + - dow (day of the week) + - doy (day of the year) + - epoch (seconds since Unix epoch) +"# + ), + argument( + name = "expression", + description = "Time expression to operate on. Can be a constant, column, or function." + ) +)] #[derive(Debug)] pub struct DatePartFunc { signature: Signature, @@ -190,7 +221,7 @@ impl ScalarUDFImpl for DatePartFunc { &self.aliases } fn documentation(&self) -> Option<&Documentation> { - Some(get_date_part_doc()) + self.doc() } } @@ -206,43 +237,6 @@ fn part_normalization(part: &str) -> &str { .unwrap_or(part) } -static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); - -fn get_date_part_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_DATETIME, - "Returns the specified part of the date as an integer.", - "date_part(part, expression)") - .with_argument( - "part", - r#"Part of the date to return. The following date parts are supported: - - - year - - quarter (emits value in inclusive range [1, 4] based on which quartile of the year the date is in) - - month - - week (week of the year) - - day (day of the month) - - hour - - minute - - second - - millisecond - - microsecond - - nanosecond - - dow (day of the week) - - doy (day of the year) - - epoch (seconds since Unix epoch) -"#, - ) - .with_argument( - "expression", - "Time expression to operate on. Can be a constant, column, or function.", - ) - .with_alternative_syntax("extract(field FROM source)") - .build() - }) -} - /// Invoke [`date_part`] on an `array` (e.g. 
Timestamp) and convert the /// result to a total number of seconds, milliseconds, microseconds or /// nanoseconds diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index b9f3bbf65973..4780f5f5b818 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -18,7 +18,7 @@ use std::any::Any; use std::ops::{Add, Sub}; use std::str::FromStr; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use arrow::array::temporal_conversions::{ as_datetime_with_timezone, timestamp_ns_to_datetime, @@ -38,12 +38,35 @@ use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TIMEZONE_WILDCARD, }; +use datafusion_macros::user_doc; use chrono::{ DateTime, Datelike, Duration, LocalResult, NaiveDateTime, Offset, TimeDelta, Timelike, }; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; +#[user_doc( + doc_section(label = "Time and Date Functions"), + description = "Truncates a timestamp value to a specified precision.", + syntax_example = "date_trunc(precision, expression)", + argument( + name = "precision", + description = r#"Time precision to truncate to. The following precisions are supported: + + - year / YEAR + - quarter / QUARTER + - month / MONTH + - week / WEEK + - day / DAY + - hour / HOUR + - minute / MINUTE + - second / SECOND +"# + ), + argument( + name = "expression", + description = "Time expression to operate on. Can be a constant, column, or function." 
+ ) +)] #[derive(Debug)] pub struct DateTruncFunc { signature: Signature, @@ -247,41 +270,10 @@ impl ScalarUDFImpl for DateTruncFunc { } } fn documentation(&self) -> Option<&Documentation> { - Some(get_date_trunc_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); - -fn get_date_trunc_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_DATETIME, - "Truncates a timestamp value to a specified precision.", - "date_trunc(precision, expression)", - ) - .with_argument( - "precision", - r#"Time precision to truncate to. The following precisions are supported: - - - year / YEAR - - quarter / QUARTER - - month / MONTH - - week / WEEK - - day / DAY - - hour / HOUR - - minute / MINUTE - - second / SECOND -"#, - ) - .with_argument( - "expression", - "Time expression to operate on. Can be a constant, column, or function.", - ) - .build() - }) -} - fn _date_trunc_coarse<T>(granularity: &str, value: Option<T>) -> Result<Option<T>> where T: Datelike + Timelike + Sub<Duration, Output = T> + Copy, diff --git a/datafusion/functions/src/datetime/from_unixtime.rs b/datafusion/functions/src/datetime/from_unixtime.rs index 374c744915f7..425da7ddac29 100644 --- a/datafusion/functions/src/datetime/from_unixtime.rs +++ b/datafusion/functions/src/datetime/from_unixtime.rs @@ -16,18 +16,36 @@ // under the License. 
use std::any::Any; -use std::sync::{Arc, OnceLock}; +use std::sync::Arc; use arrow::datatypes::DataType; use arrow::datatypes::DataType::{Int64, Timestamp, Utf8}; use arrow::datatypes::TimeUnit::Second; use datafusion_common::{exec_err, internal_err, ExprSchema, Result, ScalarValue}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; +#[user_doc( + doc_section(label = "Time and Date Functions"), + description = "Converts an integer to RFC3339 timestamp format (`YYYY-MM-DDT00:00:00.000000000Z`). Integers and unsigned integers are interpreted as nanoseconds since the unix epoch (`1970-01-01T00:00:00Z`) return the corresponding timestamp.", + syntax_example = "from_unixtime(expression[, timezone])", + sql_example = r#"```sql +> select from_unixtime(1599572549, 'America/New_York'); ++-----------------------------------------------------------+ +| from_unixtime(Int64(1599572549),Utf8("America/New_York")) | ++-----------------------------------------------------------+ +| 2020-09-08T09:42:29-04:00 | ++-----------------------------------------------------------+ +```"#, + standard_argument(name = "expression",), + argument( + name = "timezone", + description = "Optional timezone to use when converting the integer to a timestamp. If not provided, the default timezone is UTC." 
+ ) +)] #[derive(Debug)] pub struct FromUnixtimeFunc { signature: Signature, @@ -125,35 +143,10 @@ impl ScalarUDFImpl for FromUnixtimeFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_from_unixtime_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new(); - -fn get_from_unixtime_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_DATETIME, - "Converts an integer to RFC3339 timestamp format (`YYYY-MM-DDT00:00:00.000000000Z`). Integers and unsigned integers are interpreted as nanoseconds since the unix epoch (`1970-01-01T00:00:00Z`) return the corresponding timestamp.", - "from_unixtime(expression[, timezone])") - .with_standard_argument("expression", None) - .with_argument( - "timezone", - "Optional timezone to use when converting the integer to a timestamp. If not provided, the default timezone is UTC.", - ) - .with_sql_example(r#"```sql -> select from_unixtime(1599572549, 'America/New_York'); -+-----------------------------------------------------------+ -| from_unixtime(Int64(1599572549),Utf8("America/New_York")) | -+-----------------------------------------------------------+ -| 2020-09-08T09:42:29-04:00 | -+-----------------------------------------------------------+ -```"#) - .build() - }) -} - #[cfg(test)] mod test { use crate::datetime::from_unixtime::FromUnixtimeFunc; diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 081509165edf..56cc8e10fb1b 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1986,21 +1986,19 @@ date_bin(interval, expression, origin-timestamp) - **interval**: Bin interval. - **expression**: Time expression to operate on. Can be a constant, column, or function. -- **origin-timestamp**: Optional. Starting point used to determine bin boundaries. If not specified defaults 1970-01-01T00:00:00Z (the UNIX epoch in UTC). 
- -The following intervals are supported: - -- nanoseconds -- microseconds -- milliseconds -- seconds -- minutes -- hours -- days -- weeks -- months -- years -- century +- **origin-timestamp**: Optional. Starting point used to determine bin boundaries. If not specified defaults 1970-01-01T00:00:00Z (the UNIX epoch in UTC). The following intervals are supported: + + - nanoseconds + - microseconds + - milliseconds + - seconds + - minutes + - hours + - days + - weeks + - months + - years + - century #### Example