From 6ada9f7c725b34c19533eed1091a8b51473855e1 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 28 Jun 2024 17:04:49 -0400
Subject: [PATCH] Consolidate make_date

---
 datafusion-examples/examples/dataframe.rs |  51 +++++++--
 datafusion-examples/examples/make_date.rs | 120 ----------------------
 datafusion-examples/examples/sql.rs       |  84 ++++++++++++++-
 3 files changed, 125 insertions(+), 130 deletions(-)
 delete mode 100644 datafusion-examples/examples/make_date.rs

diff --git a/datafusion-examples/examples/dataframe.rs b/datafusion-examples/examples/dataframe.rs
index 442f0b694aba..8543b74abee9 100644
--- a/datafusion-examples/examples/dataframe.rs
+++ b/datafusion-examples/examples/dataframe.rs
@@ -20,28 +20,28 @@
 //!
 //! * [`parquet`]: query a single Parquet file
 //! * [`to_date_demo`]: use the `to_date` function to convert dates to strings
+//! * [`to_timestamp_demo`]: use the `to_timestamp` function to convert strings to timestamps
+//! * [`make_date_demo`]: use the `make_date` function to create dates from year, month, and day
 
-
+use arrow::array::{Int32Array, RecordBatch, StringArray};
 use datafusion::arrow::datatypes::{DataType, Field, Schema};
 use datafusion::error::Result;
 use datafusion::prelude::*;
 use std::fs::File;
 use std::io::Write;
 use std::sync::Arc;
-use arrow::array::{RecordBatch, StringArray};
 use tempfile::tempdir;
-use datafusion_common::assert_contains;
 
 #[tokio::main]
 async fn main() -> Result<()> {
     parquet().await?;
     to_date_demo().await?;
     to_timestamp_demo().await?;
+    make_date_demo().await?;
 
     Ok(())
 }
 
-
 /// This example demonstrates executing a simple query against an Arrow data
 /// source (Parquet) and fetching results, using the DataFrame trait
@@ -130,7 +130,6 @@ async fn example_read_csv_file_with_schema(file_path: &str) -> DataFrame {
     ctx.read_csv(file_path, csv_read_option).await.unwrap()
 }
 
-
 /// This example demonstrates how to use the to_date series
 /// of functions in the DataFrame API
 async fn to_date_demo() -> Result<()> {
@@ -166,8 +165,6 @@ async fn to_date_demo() -> Result<()> {
     Ok(())
 }
 
-
-
 /// This example demonstrates how to use the to_timestamp series
 /// of functions in the DataFrame API
 async fn to_timestamp_demo() -> Result<()> {
@@ -224,3 +221,43 @@ async fn to_timestamp_demo() -> Result<()> {
 
     Ok(())
 }
+
+/// This example demonstrates how to use the make_date
+/// function in the DataFrame API.
+async fn make_date_demo() -> Result<()> {
+    // define a schema.
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("y", DataType::Int32, false),
+        Field::new("m", DataType::Int32, false),
+        Field::new("d", DataType::Int32, false),
+    ]));
+
+    // define data.
+    let batch = RecordBatch::try_new(
+        schema,
+        vec![
+            Arc::new(Int32Array::from(vec![2020, 2021, 2022, 2023, 2024])),
+            Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])),
+            Arc::new(Int32Array::from(vec![15, 16, 17, 18, 19])),
+        ],
+    )?;
+
+    // declare a new context. In spark API, this corresponds to a new spark SQL session
+    let ctx = SessionContext::new();
+
+    // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
+ ctx.register_batch("t", batch)?; + let df = ctx.table("t").await?; + + // use make_date function to convert col 'y', 'm' & 'd' to a date + let df = df.with_column("a", make_date(col("y"), col("m"), col("d")))?; + // use make_date function to convert col 'y' & 'm' with a static day to a date + let df = df.with_column("b", make_date(col("y"), col("m"), lit(22)))?; + + let df = df.select_columns(&["a", "b"])?; + + // print the results + df.show().await?; + + Ok(()) +} diff --git a/datafusion-examples/examples/make_date.rs b/datafusion-examples/examples/make_date.rs deleted file mode 100644 index 98bbb21bbff8..000000000000 --- a/datafusion-examples/examples/make_date.rs +++ /dev/null @@ -1,120 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::sync::Arc; - -use datafusion::arrow::array::Int32Array; -use datafusion::arrow::datatypes::{DataType, Field, Schema}; -use datafusion::arrow::record_batch::RecordBatch; -use datafusion::error::Result; -use datafusion::prelude::*; -use datafusion_common::assert_contains; - -/// This example demonstrates how to use the make_date -/// function in the DataFrame API as well as via sql. -#[tokio::main] -async fn main() -> Result<()> { - // define a schema. - let schema = Arc::new(Schema::new(vec![ - Field::new("y", DataType::Int32, false), - Field::new("m", DataType::Int32, false), - Field::new("d", DataType::Int32, false), - ])); - - // define data. - let batch = RecordBatch::try_new( - schema, - vec![ - Arc::new(Int32Array::from(vec![2020, 2021, 2022, 2023, 2024])), - Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])), - Arc::new(Int32Array::from(vec![15, 16, 17, 18, 19])), - ], - )?; - - // declare a new context. In spark API, this corresponds to a new spark SQLsession - let ctx = SessionContext::new(); - - // declare a table in memory. In spark API, this corresponds to createDataFrame(...). 
- ctx.register_batch("t", batch)?; - let df = ctx.table("t").await?; - - // use make_date function to convert col 'y', 'm' & 'd' to a date - let df = df.with_column("a", make_date(col("y"), col("m"), col("d")))?; - // use make_date function to convert col 'y' & 'm' with a static day to a date - let df = df.with_column("b", make_date(col("y"), col("m"), lit(22)))?; - - let df = df.select_columns(&["a", "b"])?; - - // print the results - df.show().await?; - - // use sql to convert col 'y', 'm' & 'd' to a date - let df = ctx.sql("select make_date(y, m, d) from t").await?; - - // print the results - df.show().await?; - - // use sql to convert col 'y' & 'm' with a static string day to a date - let df = ctx.sql("select make_date(y, m, '22') from t").await?; - - // print the results - df.show().await?; - - // math expressions work - let df = ctx.sql("select make_date(y + 1, m, d) from t").await?; - - // print the results - df.show().await?; - - // you can cast to supported types (int, bigint, varchar) if required - let df = ctx - .sql("select make_date(2024::bigint, 01::bigint, 27::varchar(3))") - .await?; - - // print the results - df.show().await?; - - // arrow casts also work - let df = ctx - .sql("select make_date(arrow_cast(2024, 'Int64'), arrow_cast(1, 'Int64'), arrow_cast(27, 'Int64'))") - .await?; - - // print the results - df.show().await?; - - // invalid column values will result in an error - let result = ctx - .sql("select make_date(2024, null, 23)") - .await? - .collect() - .await; - - let expected = "Execution error: Unable to parse date from null/empty value"; - assert_contains!(result.unwrap_err().to_string(), expected); - - // invalid date values will also result in an error - let result = ctx - .sql("select make_date(2024, 01, 32)") - .await? - .collect() - .await; - - let expected = "Execution error: Unable to parse date from 2024, 1, 32"; - assert_contains!(result.unwrap_err().to_string(), expected); - - Ok(()) -} diff --git a/datafusion-examples/examples/sql.rs b/datafusion-examples/examples/sql.rs index 9d10952bb0a6..1c54c8cc6b00 100644 --- a/datafusion-examples/examples/sql.rs +++ b/datafusion-examples/examples/sql.rs @@ -22,8 +22,9 @@ //! * [`regexp_demo`]: regular expression functions to manipulate strings //! * [`to_char_demo`]: to_char function to convert strings to date, time, timestamp and durations //! * [`to_timestamp_demo`]: to_timestamp function to convert strings to timestamps +//! * [`make_date_demo`]: make_date function to convert year, month and day to a date -use arrow::array::{Date32Array, RecordBatch, StringArray}; +use arrow::array::{Date32Array, Int32Array, RecordBatch, StringArray}; use arrow_schema::{DataType, Field, Schema}; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::listing::ListingOptions; @@ -41,6 +42,7 @@ async fn main() -> Result<()> { regexp_demo().await?; to_char_demo().await?; to_timestamp_demo().await?; + make_date_demo().await?; Ok(()) } @@ -674,9 +676,8 @@ async fn to_char_demo() -> Result<()> { Ok(()) } - /// This example demonstrates how to use the to_timestamp series -/// of functions in the DataFrame API as well as via sql. +/// of functions via sql. async fn to_timestamp_demo() -> Result<()> { // define a schema. let schema = Arc::new(Schema::new(vec![ @@ -766,3 +767,80 @@ async fn to_timestamp_demo() -> Result<()> { Ok(()) } +/// This example demonstrates how to use the make_date +/// function via sql. +async fn make_date_demo() -> Result<()> { + // define a schema. 
+    let schema = Arc::new(Schema::new(vec![
+        Field::new("y", DataType::Int32, false),
+        Field::new("m", DataType::Int32, false),
+        Field::new("d", DataType::Int32, false),
+    ]));
+
+    // define data.
+    let batch = RecordBatch::try_new(
+        schema,
+        vec![
+            Arc::new(Int32Array::from(vec![2020, 2021, 2022, 2023, 2024])),
+            Arc::new(Int32Array::from(vec![1, 2, 3, 4, 5])),
+            Arc::new(Int32Array::from(vec![15, 16, 17, 18, 19])),
+        ],
+    )?;
+
+    // declare a new context. In spark API, this corresponds to a new spark SQL session
+    let ctx = SessionContext::new();
+
+    // declare a table in memory. In spark API, this corresponds to createDataFrame(...).
+    ctx.register_batch("t", batch)?;
+    let df = ctx.table("t").await?;
+
+    // use sql to convert col 'y' & 'm' with a static string day to a date
+    let df = ctx.sql("select make_date(y, m, '22') from t").await?;
+
+    // print the results
+    df.show().await?;
+
+    // math expressions work
+    let df = ctx.sql("select make_date(y + 1, m, d) from t").await?;
+
+    // print the results
+    df.show().await?;
+
+    // you can cast to supported types (int, bigint, varchar) if required
+    let df = ctx
+        .sql("select make_date(2024::bigint, 01::bigint, 27::varchar(3))")
+        .await?;
+
+    // print the results
+    df.show().await?;
+
+    // arrow casts also work
+    let df = ctx
+        .sql("select make_date(arrow_cast(2024, 'Int64'), arrow_cast(1, 'Int64'), arrow_cast(27, 'Int64'))")
+        .await?;
+
+    // print the results
+    df.show().await?;
+
+    // invalid column values will result in an error
+    let result = ctx
+        .sql("select make_date(2024, null, 23)")
+        .await?
+        .collect()
+        .await;
+
+    let expected = "Execution error: Unable to parse date from null/empty value";
+    assert_contains!(result.unwrap_err().to_string(), expected);
+
+    // invalid date values will also result in an error
+    let result = ctx
+        .sql("select make_date(2024, 01, 32)")
+        .await?
+        .collect()
+        .await;
+
+    let expected = "Execution error: Unable to parse date from 2024, 1, 32";
+    assert_contains!(result.unwrap_err().to_string(), expected);
+
+    Ok(())
+}
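
For reference, a minimal standalone sketch (not part of the patch above) that exercises the same make_date calls the consolidated demos use, once via the DataFrame API (as in dataframe.rs) and once via SQL (as in sql.rs). It assumes the arrow, datafusion, and tokio dependencies already present in the datafusion-examples crate:

use std::sync::Arc;

use arrow::array::{Int32Array, RecordBatch};
use arrow::datatypes::{DataType, Field, Schema};
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    // same y/m/d columns as the demo batch above, with a smaller sample
    let schema = Arc::new(Schema::new(vec![
        Field::new("y", DataType::Int32, false),
        Field::new("m", DataType::Int32, false),
        Field::new("d", DataType::Int32, false),
    ]));
    let batch = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(Int32Array::from(vec![2020, 2021, 2022])),
            Arc::new(Int32Array::from(vec![1, 2, 3])),
            Arc::new(Int32Array::from(vec![15, 16, 17])),
        ],
    )?;

    let ctx = SessionContext::new();
    ctx.register_batch("t", batch)?;

    // DataFrame API path, as in the dataframe.rs demo
    ctx.table("t")
        .await?
        .with_column("a", make_date(col("y"), col("m"), col("d")))?
        .select_columns(&["a"])?
        .show()
        .await?;

    // SQL path, as in the sql.rs demo
    ctx.sql("select make_date(y, m, d) from t").await?.show().await?;

    Ok(())
}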