From 2ea2c2720b0ebef2e5658c98f61c11b17e9d024b Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 12 Nov 2024 13:33:51 +0800 Subject: [PATCH 01/18] add type sig class Signed-off-by: jayzhan211 --- datafusion/expr-common/src/signature.rs | 40 +++++++++++--- .../expr/src/type_coercion/functions.rs | 54 +++++++++++-------- datafusion/functions/Cargo.toml | 1 + datafusion/functions/src/string/repeat.rs | 6 ++- datafusion/sqllogictest/test_files/expr.slt | 2 +- 5 files changed, 73 insertions(+), 30 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 3846fae5de5d..26b2de48984a 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -18,6 +18,8 @@ //! Signature module contains foundational types that are used to represent signatures, types, //! and return types of functions in DataFusion. +use std::fmt::Display; + use crate::type_coercion::aggregates::{NUMERICS, STRINGS}; use arrow::datatypes::DataType; use datafusion_common::types::{LogicalTypeRef, NativeType}; @@ -112,7 +114,7 @@ pub enum TypeSignature { /// For example, `Coercible(vec![logical_float64()])` accepts /// arguments like `vec![DataType::Int32]` or `vec![DataType::Float32]` /// since i32 and f32 can be casted to f64 - Coercible(Vec), + Coercible(Vec), /// Fixed number of arguments of arbitrary types /// If a function takes 0 argument, its `TypeSignature` should be `Any(0)` Any(usize), @@ -137,6 +139,22 @@ pub enum TypeSignature { String(usize), } +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] +pub enum TypeSignatureClass { + Timestamp, + // TODO: + // Interval + // Numeric + // Integer + Native(LogicalTypeRef), +} + +impl Display for TypeSignatureClass { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{self:?}") + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] pub enum ArrayFunctionSignature { /// Specialized Signature for ArrayAppend and similar functions @@ -163,7 +181,7 @@ pub enum ArrayFunctionSignature { MapArray, } -impl std::fmt::Display for ArrayFunctionSignature { +impl Display for ArrayFunctionSignature { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { ArrayFunctionSignature::ArrayAndElement => { @@ -232,7 +250,7 @@ impl TypeSignature { } /// Helper function to join types with specified delimiter. - pub fn join_types(types: &[T], delimiter: &str) -> String { + pub fn join_types(types: &[T], delimiter: &str) -> String { types .iter() .map(|t| t.to_string()) @@ -267,7 +285,10 @@ impl TypeSignature { .collect(), TypeSignature::Coercible(types) => types .iter() - .map(|logical_type| get_data_types(logical_type.native())) + .map(|logical_type| match logical_type { + TypeSignatureClass::Native(l) => get_data_types(l.native()), + tsc => todo!("{tsc} not supported yet"), + }) .multi_cartesian_product() .collect(), TypeSignature::Variadic(types) => types @@ -400,7 +421,10 @@ impl Signature { } } /// Target coerce types in order - pub fn coercible(target_types: Vec, volatility: Volatility) -> Self { + pub fn coercible( + target_types: Vec, + volatility: Volatility, + ) -> Self { Self { type_signature: TypeSignature::Coercible(target_types), volatility, @@ -580,8 +604,10 @@ mod tests { ] ); - let type_signature = - TypeSignature::Coercible(vec![logical_string(), logical_int64()]); + let type_signature = TypeSignature::Coercible(vec![ + TypeSignatureClass::Native(logical_string()), + TypeSignatureClass::Native(logical_int64()), + ]); let possible_types = type_signature.get_possible_types(); assert_eq!( possible_types, diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 5a4d89a0b2ec..fcd6d27238f7 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -28,7 +28,10 @@ use datafusion_common::{ Result, }; use datafusion_expr_common::{ - signature::{ArrayFunctionSignature, FIXED_SIZE_LIST_WILDCARD, TIMEZONE_WILDCARD}, + signature::{ + ArrayFunctionSignature, TypeSignatureClass, FIXED_SIZE_LIST_WILDCARD, + TIMEZONE_WILDCARD, + }, type_coercion::binary::string_coercion, }; use std::sync::Arc; @@ -521,35 +524,44 @@ fn get_valid_types( // Make sure the corresponding test is covered // If this function becomes COMPLEX, create another new signature! fn can_coerce_to( - logical_type: &NativeType, - target_type: &NativeType, - ) -> bool { - if logical_type == target_type { - return true; - } + current_type: &DataType, + target_type_class: &TypeSignatureClass, + ) -> Result { + let logical_type: NativeType = current_type.into(); - if logical_type == &NativeType::Null { - return true; - } + match target_type_class { + TypeSignatureClass::Native(native_type) => { + let target_type = native_type.native(); + if &logical_type == target_type { + return target_type.default_cast_for(current_type); + } + + if logical_type == NativeType::Null { + return target_type.default_cast_for(current_type); + } - if target_type.is_integer() && logical_type.is_integer() { - return true; + if target_type.is_integer() && logical_type.is_integer() { + return target_type.default_cast_for(current_type); + } + } + _ => { + todo!("") + } } - false + internal_err!( + "Expect {} but received {}", + target_type_class, + current_type + ) } let mut new_types = Vec::with_capacity(current_types.len()); - for (current_type, target_type) in + for (current_type, target_type_class) in current_types.iter().zip(target_types.iter()) { - let logical_type: NativeType = current_type.into(); - let target_logical_type = target_type.native(); - if can_coerce_to(&logical_type, target_logical_type) { - let target_type = - target_logical_type.default_cast_for(current_type)?; - new_types.push(target_type); - } + let target_type = can_coerce_to(current_type, target_type_class)?; + new_types.push(target_type); } vec![new_types] diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 70a988dbfefb..75aac699a770 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -73,6 +73,7 @@ blake3 = { version = "1.0", optional = true } chrono = { workspace = true } datafusion-common = { workspace = true } datafusion-execution = { workspace = true } +datafusion-expr-common = { workspace = true } datafusion-expr = { workspace = true } hashbrown = { workspace = true, optional = true } hex = { version = "0.4", optional = true } diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index 249ce15d6dbe..d169c4e74e61 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -32,6 +32,7 @@ use datafusion_common::{exec_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; +use datafusion_expr_common::signature::TypeSignatureClass; #[derive(Debug)] pub struct RepeatFunc { @@ -48,7 +49,10 @@ impl RepeatFunc { pub fn new() -> Self { Self { signature: Signature::coercible( - vec![logical_string(), logical_int64()], + vec![ + TypeSignatureClass::Native(logical_string()), + TypeSignatureClass::Native(logical_int64()), + ], Volatility::Immutable, ), } diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index c653113fd438..d0e3eabdd6a3 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -555,7 +555,7 @@ select repeat('-1.2', arrow_cast(3, 'Int32')); ---- -1.2-1.2-1.2 -query error DataFusion error: Error during planning: Error during planning: Coercion from \[Utf8, Float64\] to the signature +query error DataFusion error: Error during planning: Internal error: Expect Native\(LogicalType\(Native\(Int64\), Int64\)\) but received Float64 select repeat('-1.2', 3.2); query T From 664edaa3965f9982dae50af12864f46fdb83942a Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 12 Nov 2024 13:50:23 +0800 Subject: [PATCH 02/18] timestamp Signed-off-by: jayzhan211 --- datafusion/common/src/types/native.rs | 8 ++++++ datafusion/expr-common/src/signature.rs | 2 +- .../expr/src/type_coercion/functions.rs | 28 ++++++++++--------- .../functions/src/datetime/date_part.rs | 6 +++- 4 files changed, 29 insertions(+), 15 deletions(-) diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 7e326dc15bb2..a7cf784ea1a6 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -433,4 +433,12 @@ impl NativeType { UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 ) } + + #[inline] + pub fn is_timestamp(&self) -> bool { + matches!( + self, + NativeType::Timestamp(_, _) + ) + } } diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 26b2de48984a..808a34e5f1f8 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -151,7 +151,7 @@ pub enum TypeSignatureClass { impl Display for TypeSignatureClass { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{self:?}") + write!(f, "TypeSignatureClass::{self:?}") } } diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index fcd6d27238f7..8954b98a5a96 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -22,10 +22,7 @@ use arrow::{ datatypes::{DataType, TimeUnit}, }; use datafusion_common::{ - exec_err, internal_datafusion_err, internal_err, plan_err, - types::{LogicalType, NativeType}, - utils::{coerced_fixed_size_list_to_list, list_ndims}, - Result, + exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, types::{LogicalType, NativeType}, utils::{coerced_fixed_size_list_to_list, list_ndims}, Result }; use datafusion_expr_common::{ signature::{ @@ -533,27 +530,32 @@ fn get_valid_types( TypeSignatureClass::Native(native_type) => { let target_type = native_type.native(); if &logical_type == target_type { - return target_type.default_cast_for(current_type); + return target_type.default_cast_for(current_type) } if logical_type == NativeType::Null { - return target_type.default_cast_for(current_type); + return target_type.default_cast_for(current_type) } if target_type.is_integer() && logical_type.is_integer() { - return target_type.default_cast_for(current_type); + return target_type.default_cast_for(current_type) } + + internal_err!( + "Expect {} but received {}", + target_type_class, + current_type + ) + } + TypeSignatureClass::Timestamp if logical_type.is_timestamp() => { + NativeType::Timestamp(TimeUnit::Second, None).default_cast_for(current_type) } _ => { - todo!("") + not_impl_err!("{target_type_class}") } } - internal_err!( - "Expect {} but received {}", - target_type_class, - current_type - ) + } let mut new_types = Vec::with_capacity(current_types.len()); diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 01e094bc4e0b..2b81d6c5852d 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -36,12 +36,14 @@ use datafusion_common::cast::{ as_timestamp_microsecond_array, as_timestamp_millisecond_array, as_timestamp_nanosecond_array, as_timestamp_second_array, }; +use datafusion_common::types::logical_string; use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; -use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::TypeSignature::{self, Exact}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TIMEZONE_WILDCARD, }; +use datafusion_expr_common::signature::TypeSignatureClass; #[derive(Debug)] pub struct DatePartFunc { @@ -60,6 +62,8 @@ impl DatePartFunc { Self { signature: Signature::one_of( vec![ + TypeSignature::Coercible(vec![TypeSignatureClass::Native(logical_string()), TypeSignatureClass::Timestamp]), + Exact(vec![Utf8, Timestamp(Nanosecond, None)]), Exact(vec![Utf8View, Timestamp(Nanosecond, None)]), Exact(vec![ From fc9921610ebaec46966a702a707e1d7b6ba059cd Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 12 Nov 2024 14:58:09 +0800 Subject: [PATCH 03/18] date part Signed-off-by: jayzhan211 --- datafusion/common/src/types/native.rs | 25 +++++- datafusion/expr-common/src/signature.rs | 32 ++++++- .../expr/src/type_coercion/functions.rs | 37 +++++--- .../functions/src/datetime/date_part.rs | 86 ++++--------------- datafusion/sqllogictest/test_files/expr.slt | 22 +++-- .../sqllogictest/test_files/group_by.slt | 2 +- 6 files changed, 111 insertions(+), 93 deletions(-) diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index a7cf784ea1a6..391a546496b5 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -436,9 +436,26 @@ impl NativeType { #[inline] pub fn is_timestamp(&self) -> bool { - matches!( - self, - NativeType::Timestamp(_, _) - ) + matches!(self, NativeType::Timestamp(_, _)) + } + + #[inline] + pub fn is_date(&self) -> bool { + matches!(self, NativeType::Date) + } + + #[inline] + pub fn is_time(&self) -> bool { + matches!(self, NativeType::Time(_)) + } + + #[inline] + pub fn is_interval(&self) -> bool { + matches!(self, NativeType::Interval(_)) + } + + #[inline] + pub fn is_duration(&self) -> bool { + matches!(self, NativeType::Duration(_)) } } diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 808a34e5f1f8..05c9366a534f 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -21,7 +21,9 @@ use std::fmt::Display; use crate::type_coercion::aggregates::{NUMERICS, STRINGS}; -use arrow::datatypes::DataType; +use arrow:: + datatypes::{DataType, IntervalUnit, TimeUnit} +; use datafusion_common::types::{LogicalTypeRef, NativeType}; use itertools::Itertools; @@ -139,11 +141,21 @@ pub enum TypeSignature { String(usize), } +impl TypeSignature { + #[inline] + pub fn is_one_of(&self) -> bool { + matches!(self, TypeSignature::OneOf(_)) + } +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] pub enum TypeSignatureClass { Timestamp, + Date, + Time, + Interval, + Duration, // TODO: - // Interval // Numeric // Integer Native(LogicalTypeRef), @@ -287,7 +299,21 @@ impl TypeSignature { .iter() .map(|logical_type| match logical_type { TypeSignatureClass::Native(l) => get_data_types(l.native()), - tsc => todo!("{tsc} not supported yet"), + TypeSignatureClass::Timestamp => { + vec![DataType::Timestamp(TimeUnit::Nanosecond, None)] + } + TypeSignatureClass::Date => { + vec![DataType::Date64] + } + TypeSignatureClass::Time => { + vec![DataType::Time64(TimeUnit::Nanosecond)] + } + TypeSignatureClass::Interval => { + vec![DataType::Interval(IntervalUnit::DayTime)] + } + TypeSignatureClass::Duration => { + vec![DataType::Duration(TimeUnit::Nanosecond)] + } }) .multi_cartesian_product() .collect(), diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 8954b98a5a96..28b76452587a 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -22,7 +22,10 @@ use arrow::{ datatypes::{DataType, TimeUnit}, }; use datafusion_common::{ - exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, types::{LogicalType, NativeType}, utils::{coerced_fixed_size_list_to_list, list_ndims}, Result + exec_err, internal_datafusion_err, internal_err, not_impl_err, plan_err, + types::{LogicalType, NativeType}, + utils::{coerced_fixed_size_list_to_list, list_ndims}, + Result, }; use datafusion_expr_common::{ signature::{ @@ -193,8 +196,12 @@ fn try_coerce_types( // Well-supported signature that returns exact valid types. if !valid_types.is_empty() && is_well_supported_signature(type_signature) { - // exact valid types - assert_eq!(valid_types.len(), 1); + // There may be many valid types if valid signature is OneOf + // Otherwise, there should be only one valid type + if !type_signature.is_one_of() { + assert_eq!(valid_types.len(), 1); + } + let valid_types = valid_types.swap_remove(0); if let Some(t) = maybe_data_types_without_coercion(&valid_types, current_types) { return Ok(t); @@ -530,15 +537,15 @@ fn get_valid_types( TypeSignatureClass::Native(native_type) => { let target_type = native_type.native(); if &logical_type == target_type { - return target_type.default_cast_for(current_type) + return target_type.default_cast_for(current_type); } if logical_type == NativeType::Null { - return target_type.default_cast_for(current_type) + return target_type.default_cast_for(current_type); } if target_type.is_integer() && logical_type.is_integer() { - return target_type.default_cast_for(current_type) + return target_type.default_cast_for(current_type); } internal_err!( @@ -548,14 +555,24 @@ fn get_valid_types( ) } TypeSignatureClass::Timestamp if logical_type.is_timestamp() => { - NativeType::Timestamp(TimeUnit::Second, None).default_cast_for(current_type) + Ok(current_type.to_owned()) + } + TypeSignatureClass::Date if logical_type.is_date() => { + Ok(current_type.to_owned()) + } + TypeSignatureClass::Time if logical_type.is_time() => { + Ok(current_type.to_owned()) + } + TypeSignatureClass::Interval if logical_type.is_interval() => { + Ok(current_type.to_owned()) + } + TypeSignatureClass::Duration if logical_type.is_duration() => { + Ok(current_type.to_owned()) } _ => { - not_impl_err!("{target_type_class}") + not_impl_err!("Got logical_type: {logical_type} with target_type_class: {target_type_class}") } } - - } let mut new_types = Vec::with_capacity(current_types.len()); diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 2b81d6c5852d..7ef6978469aa 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -23,10 +23,8 @@ use arrow::array::{Array, ArrayRef, Float64Array}; use arrow::compute::kernels::cast_utils::IntervalUnit; use arrow::compute::{binary, cast, date_part, DatePart}; use arrow::datatypes::DataType::{ - Date32, Date64, Duration, Float64, Interval, Time32, Time64, Timestamp, Utf8, - Utf8View, + Date32, Date64, Duration, Float64, Interval, Time32, Time64, Timestamp, }; -use arrow::datatypes::IntervalUnit::{DayTime, MonthDayNano, YearMonth}; use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::datatypes::{DataType, TimeUnit}; @@ -39,9 +37,9 @@ use datafusion_common::cast::{ use datafusion_common::types::logical_string; use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; -use datafusion_expr::TypeSignature::{self, Exact}; +use datafusion_expr::TypeSignature; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TIMEZONE_WILDCARD, + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; use datafusion_expr_common::signature::TypeSignatureClass; @@ -62,74 +60,26 @@ impl DatePartFunc { Self { signature: Signature::one_of( vec![ - TypeSignature::Coercible(vec![TypeSignatureClass::Native(logical_string()), TypeSignatureClass::Timestamp]), - - Exact(vec![Utf8, Timestamp(Nanosecond, None)]), - Exact(vec![Utf8View, Timestamp(Nanosecond, None)]), - Exact(vec![ - Utf8, - Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![ - Utf8View, - Timestamp(Nanosecond, Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![Utf8, Timestamp(Millisecond, None)]), - Exact(vec![Utf8View, Timestamp(Millisecond, None)]), - Exact(vec![ - Utf8, - Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())), - ]), - Exact(vec![ - Utf8View, - Timestamp(Millisecond, Some(TIMEZONE_WILDCARD.into())), + TypeSignature::Coercible(vec![ + TypeSignatureClass::Native(logical_string()), + TypeSignatureClass::Timestamp, ]), - Exact(vec![Utf8, Timestamp(Microsecond, None)]), - Exact(vec![Utf8View, Timestamp(Microsecond, None)]), - Exact(vec![ - Utf8, - Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())), + TypeSignature::Coercible(vec![ + TypeSignatureClass::Native(logical_string()), + TypeSignatureClass::Date, ]), - Exact(vec![ - Utf8View, - Timestamp(Microsecond, Some(TIMEZONE_WILDCARD.into())), + TypeSignature::Coercible(vec![ + TypeSignatureClass::Native(logical_string()), + TypeSignatureClass::Time, ]), - Exact(vec![Utf8, Timestamp(Second, None)]), - Exact(vec![Utf8View, Timestamp(Second, None)]), - Exact(vec![ - Utf8, - Timestamp(Second, Some(TIMEZONE_WILDCARD.into())), + TypeSignature::Coercible(vec![ + TypeSignatureClass::Native(logical_string()), + TypeSignatureClass::Interval, ]), - Exact(vec![ - Utf8View, - Timestamp(Second, Some(TIMEZONE_WILDCARD.into())), + TypeSignature::Coercible(vec![ + TypeSignatureClass::Native(logical_string()), + TypeSignatureClass::Duration, ]), - Exact(vec![Utf8, Date64]), - Exact(vec![Utf8View, Date64]), - Exact(vec![Utf8, Date32]), - Exact(vec![Utf8View, Date32]), - Exact(vec![Utf8, Time32(Second)]), - Exact(vec![Utf8View, Time32(Second)]), - Exact(vec![Utf8, Time32(Millisecond)]), - Exact(vec![Utf8View, Time32(Millisecond)]), - Exact(vec![Utf8, Time64(Microsecond)]), - Exact(vec![Utf8View, Time64(Microsecond)]), - Exact(vec![Utf8, Time64(Nanosecond)]), - Exact(vec![Utf8View, Time64(Nanosecond)]), - Exact(vec![Utf8, Interval(YearMonth)]), - Exact(vec![Utf8View, Interval(YearMonth)]), - Exact(vec![Utf8, Interval(DayTime)]), - Exact(vec![Utf8View, Interval(DayTime)]), - Exact(vec![Utf8, Interval(MonthDayNano)]), - Exact(vec![Utf8View, Interval(MonthDayNano)]), - Exact(vec![Utf8, Duration(Second)]), - Exact(vec![Utf8View, Duration(Second)]), - Exact(vec![Utf8, Duration(Millisecond)]), - Exact(vec![Utf8View, Duration(Millisecond)]), - Exact(vec![Utf8, Duration(Microsecond)]), - Exact(vec![Utf8View, Duration(Microsecond)]), - Exact(vec![Utf8, Duration(Nanosecond)]), - Exact(vec![Utf8View, Duration(Nanosecond)]), ], Volatility::Immutable, ), diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index d0e3eabdd6a3..4e7dd4401f24 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -555,7 +555,7 @@ select repeat('-1.2', arrow_cast(3, 'Int32')); ---- -1.2-1.2-1.2 -query error DataFusion error: Error during planning: Internal error: Expect Native\(LogicalType\(Native\(Int64\), Int64\)\) but received Float64 +query error DataFusion error: Error during planning: Internal error: Expect TypeSignatureClass::Native\(LogicalType\(Native\(Int64\), Int64\)\) but received Float64 select repeat('-1.2', 3.2); query T @@ -1096,23 +1096,27 @@ SELECT date_part('nanosecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- 12123456780 -query R +# Second argument should not be string, failed in postgres too. +query error SELECT date_part('second', '2020-09-08T12:00:12.12345678+00:00') + +query R +SELECT date_part('second', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- 12.12345678 query R -SELECT date_part('millisecond', '2020-09-08T12:00:12.12345678+00:00') +SELECT date_part('millisecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- 12123.45678 query R -SELECT date_part('microsecond', '2020-09-08T12:00:12.12345678+00:00') +SELECT date_part('microsecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- 12123456.78 query R -SELECT date_part('nanosecond', '2020-09-08T12:00:12.12345678+00:00') +SELECT date_part('nanosecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- 12123456780 @@ -1357,13 +1361,17 @@ SELECT date_part('second', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanose ---- 50.123456789 -query R +# Second argument should not be string, failed in postgres too +query error select extract(second from '2024-08-09T12:13:14') + +query R +select extract(second from timestamp '2024-08-09T12:13:14') ---- 14 query R -select extract(seconds from '2024-08-09T12:13:14') +select extract(seconds from timestamp '2024-08-09T12:13:14') ---- 14 diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 4b90ddf2ea5f..7da4c6107c81 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -4281,7 +4281,7 @@ EXPLAIN SELECT extract(month from ts) as months logical_plan 01)Sort: months DESC NULLS FIRST, fetch=5 02)--Projection: date_part(Utf8("MONTH"),csv_with_timestamps.ts) AS months -03)----Aggregate: groupBy=[[date_part(Utf8("MONTH"), csv_with_timestamps.ts)]], aggr=[[]] +03)----Aggregate: groupBy=[[date_part(Utf8View("MONTH"), csv_with_timestamps.ts) AS date_part(Utf8("MONTH"),csv_with_timestamps.ts)]], aggr=[[]] 04)------TableScan: csv_with_timestamps projection=[ts] physical_plan 01)SortPreservingMergeExec: [months@0 DESC], fetch=5 From 2eac9f889581deb973a2874c48c4d80252ba7ebe Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 12 Nov 2024 14:59:45 +0800 Subject: [PATCH 04/18] fmt Signed-off-by: jayzhan211 --- datafusion/expr-common/src/signature.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 05c9366a534f..f3da566b248a 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -21,9 +21,7 @@ use std::fmt::Display; use crate::type_coercion::aggregates::{NUMERICS, STRINGS}; -use arrow:: - datatypes::{DataType, IntervalUnit, TimeUnit} -; +use arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; use datafusion_common::types::{LogicalTypeRef, NativeType}; use itertools::Itertools; From dd3fb7f6c26177e7148477761fac7df203ef7e1c Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 12 Nov 2024 15:02:26 +0800 Subject: [PATCH 05/18] taplo format Signed-off-by: jayzhan211 --- datafusion-cli/Cargo.lock | 1 + datafusion/functions/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 02bd01a49905..e6cd83d65d29 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1384,6 +1384,7 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", "hashbrown 0.14.5", "hex", "itertools", diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 75aac699a770..5872af7b40de 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -73,8 +73,8 @@ blake3 = { version = "1.0", optional = true } chrono = { workspace = true } datafusion-common = { workspace = true } datafusion-execution = { workspace = true } -datafusion-expr-common = { workspace = true } datafusion-expr = { workspace = true } +datafusion-expr-common = { workspace = true } hashbrown = { workspace = true, optional = true } hex = { version = "0.4", optional = true } itertools = { workspace = true } From e114c864efc20a0692d7e0556817012ae8ea1b3b Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 12 Nov 2024 16:33:06 +0800 Subject: [PATCH 06/18] tpch test Signed-off-by: jayzhan211 --- datafusion/sqllogictest/test_files/tpch/q7.slt.part | 2 +- datafusion/sqllogictest/test_files/tpch/q8.slt.part | 2 +- datafusion/sqllogictest/test_files/tpch/q9.slt.part | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/sqllogictest/test_files/tpch/q7.slt.part b/datafusion/sqllogictest/test_files/tpch/q7.slt.part index a16af4710478..30dc40ebde93 100644 --- a/datafusion/sqllogictest/test_files/tpch/q7.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q7.slt.part @@ -62,7 +62,7 @@ logical_plan 02)--Projection: shipping.supp_nation, shipping.cust_nation, shipping.l_year, sum(shipping.volume) AS revenue 03)----Aggregate: groupBy=[[shipping.supp_nation, shipping.cust_nation, shipping.l_year]], aggr=[[sum(shipping.volume)]] 04)------SubqueryAlias: shipping -05)--------Projection: n1.n_name AS supp_nation, n2.n_name AS cust_nation, date_part(Utf8("YEAR"), lineitem.l_shipdate) AS l_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume +05)--------Projection: n1.n_name AS supp_nation, n2.n_name AS cust_nation, date_part(Utf8View("YEAR"), lineitem.l_shipdate) AS l_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume 06)----------Inner Join: customer.c_nationkey = n2.n_nationkey Filter: n1.n_name = Utf8("FRANCE") AND n2.n_name = Utf8("GERMANY") OR n1.n_name = Utf8("GERMANY") AND n2.n_name = Utf8("FRANCE") 07)------------Projection: lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate, customer.c_nationkey, n1.n_name 08)--------------Inner Join: supplier.s_nationkey = n1.n_nationkey diff --git a/datafusion/sqllogictest/test_files/tpch/q8.slt.part b/datafusion/sqllogictest/test_files/tpch/q8.slt.part index fd5773438466..67d5b393a401 100644 --- a/datafusion/sqllogictest/test_files/tpch/q8.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q8.slt.part @@ -60,7 +60,7 @@ logical_plan 02)--Projection: all_nations.o_year, CAST(CAST(sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END) AS Decimal128(12, 2)) / CAST(sum(all_nations.volume) AS Decimal128(12, 2)) AS Decimal128(15, 2)) AS mkt_share 03)----Aggregate: groupBy=[[all_nations.o_year]], aggr=[[sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Decimal128(Some(0),38,4) END) AS sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)]] 04)------SubqueryAlias: all_nations -05)--------Projection: date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume, n2.n_name AS nation +05)--------Projection: date_part(Utf8View("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume, n2.n_name AS nation 06)----------Inner Join: n1.n_regionkey = region.r_regionkey 07)------------Projection: lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n1.n_regionkey, n2.n_name 08)--------------Inner Join: supplier.s_nationkey = n2.n_nationkey diff --git a/datafusion/sqllogictest/test_files/tpch/q9.slt.part b/datafusion/sqllogictest/test_files/tpch/q9.slt.part index c4910beb842b..f6d9713953cb 100644 --- a/datafusion/sqllogictest/test_files/tpch/q9.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q9.slt.part @@ -56,7 +56,7 @@ logical_plan 02)--Projection: profit.nation, profit.o_year, sum(profit.amount) AS sum_profit 03)----Aggregate: groupBy=[[profit.nation, profit.o_year]], aggr=[[sum(profit.amount)]] 04)------SubqueryAlias: profit -05)--------Projection: nation.n_name AS nation, date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) - partsupp.ps_supplycost * lineitem.l_quantity AS amount +05)--------Projection: nation.n_name AS nation, date_part(Utf8View("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) - partsupp.ps_supplycost * lineitem.l_quantity AS amount 06)----------Inner Join: supplier.s_nationkey = nation.n_nationkey 07)------------Projection: lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost, orders.o_orderdate 08)--------------Inner Join: lineitem.l_orderkey = orders.o_orderkey From f04aed55113b877ae098c4e42bde631460cb474e Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 12 Nov 2024 17:04:30 +0800 Subject: [PATCH 07/18] msrc issue Signed-off-by: jayzhan211 --- datafusion/expr-common/src/signature.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index f3da566b248a..f65e7b1d26e2 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -146,7 +146,7 @@ impl TypeSignature { } } -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] +#[derive(Debug, Clone, Eq, PartialOrd, Hash)] pub enum TypeSignatureClass { Timestamp, Date, @@ -159,6 +159,15 @@ pub enum TypeSignatureClass { Native(LogicalTypeRef), } +impl PartialEq for TypeSignatureClass { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::Native(l0), Self::Native(r0)) => l0 == r0, + _ => core::mem::discriminant(self) == core::mem::discriminant(other), + } + } +} + impl Display for TypeSignatureClass { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "TypeSignatureClass::{self:?}") From 3b8030c4260fc2b411df120ea4fd22a3ffd588d4 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 12 Nov 2024 17:04:58 +0800 Subject: [PATCH 08/18] msrc issue Signed-off-by: jayzhan211 --- datafusion/expr-common/src/signature.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index f65e7b1d26e2..589c8f30f38c 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -159,6 +159,7 @@ pub enum TypeSignatureClass { Native(LogicalTypeRef), } +// TODO: MSRV issue: Default macro doesn't work in 1.79. Use default PartialEq macro after it is able to compile impl PartialEq for TypeSignatureClass { fn eq(&self, other: &Self) -> bool { match (self, other) { From 6b1e08ac14acd9b7ec651dad91e80245504301e8 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 12 Nov 2024 17:50:46 +0800 Subject: [PATCH 09/18] explicit hash Signed-off-by: jayzhan211 --- datafusion/expr-common/src/signature.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 589c8f30f38c..a0ee00214693 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -146,7 +146,7 @@ impl TypeSignature { } } -#[derive(Debug, Clone, Eq, PartialOrd, Hash)] +#[derive(Debug, Clone, Eq, PartialOrd)] pub enum TypeSignatureClass { Timestamp, Date, @@ -169,6 +169,12 @@ impl PartialEq for TypeSignatureClass { } } +impl std::hash::Hash for TypeSignatureClass { + fn hash(&self, state: &mut H) { + core::mem::discriminant(self).hash(state); + } +} + impl Display for TypeSignatureClass { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "TypeSignatureClass::{self:?}") From 1b2a3fd92f9b3aeedf164cb6ab98686195a407bc Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Sun, 8 Dec 2024 15:17:40 +0800 Subject: [PATCH 10/18] Enhance type coercion and function signatures - Added logic to prevent unnecessary casting of string types in `native.rs`. - Introduced `Comparable` variant in `TypeSignature` to define coercion rules for comparisons. - Updated imports in `functions.rs` and `signature.rs` for better organization. - Modified `date_part.rs` to improve handling of timestamp extraction and fixed query tests in `expr.slt`. - Added `datafusion-macros` dependency in `Cargo.toml` and `Cargo.lock`. These changes improve type handling and ensure more accurate function behavior in SQL expressions. --- datafusion-cli/Cargo.lock | 1 + datafusion/common/src/types/native.rs | 2 ++ datafusion/expr-common/src/signature.rs | 11 +++++++- .../expr/src/type_coercion/functions.rs | 1 + datafusion/functions/Cargo.toml | 1 + .../functions/src/datetime/date_part.rs | 24 +++++++++-------- datafusion/sqllogictest/test_files/expr.slt | 26 +++++++++---------- 7 files changed, 40 insertions(+), 26 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index e6cd83d65d29..64b5929cf576 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1385,6 +1385,7 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-expr-common", + "datafusion-macros", "hashbrown 0.14.5", "hex", "itertools", diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 391a546496b5..c5f180a15035 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -245,6 +245,8 @@ impl LogicalType for NativeType { (Self::FixedSizeBinary(size), _) => FixedSizeBinary(*size), (Self::String, LargeBinary) => LargeUtf8, (Self::String, BinaryView) => Utf8View, + // We don't cast to another kind of string type if the origin one is already a string type + (Self::String, Utf8 | LargeUtf8 | Utf8View) => origin.to_owned(), (Self::String, data_type) if can_cast_types(data_type, &Utf8View) => Utf8View, (Self::String, data_type) if can_cast_types(data_type, &LargeUtf8) => { LargeUtf8 diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index d261dc2879f5..d9a833217158 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -20,7 +20,7 @@ use std::fmt::Display; -use crate::type_coercion::aggregates::{NUMERICS, STRINGS}; +use crate::type_coercion::aggregates::NUMERICS; use arrow::datatypes::{DataType, IntervalUnit, TimeUnit}; use datafusion_common::types::{LogicalTypeRef, NativeType}; use itertools::Itertools; @@ -115,6 +115,15 @@ pub enum TypeSignature { /// arguments like `vec![DataType::Int32]` or `vec![DataType::Float32]` /// since i32 and f32 can be casted to f64 Coercible(Vec), + /// The arguments will be coerced to a single type based on the comparison rules. + /// For example, i32 and i64 has coerced type Int64. + /// + /// Note: + /// - If compares with numeric and string, numeric is preferred for numeric string cases. For example, nullif('2', 1) has coerced types Int64. + /// - If the result is Null, it will be coerced to String (Utf8View). + /// + /// See `comparison_coercion_numeric` for more details. + Comparable(usize), /// Fixed number of arguments of arbitrary types /// If a function takes 0 argument, its `TypeSignature` should be `Any(0)` Any(usize), diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index ed510ad6b43e..989761cba745 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -32,6 +32,7 @@ use datafusion_expr_common::{ ArrayFunctionSignature, TypeSignatureClass, FIXED_SIZE_LIST_WILDCARD, TIMEZONE_WILDCARD, }, + type_coercion::binary::comparison_coercion_numeric, type_coercion::binary::string_coercion, }; use std::sync::Arc; diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 5872af7b40de..a5e630c77b87 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -75,6 +75,7 @@ datafusion-common = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-expr-common = { workspace = true } +datafusion-macros = { workspace = true } hashbrown = { workspace = true, optional = true } hex = { version = "0.4", optional = true } itertools = { workspace = true } diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 7ef6978469aa..2e831a8b9bf7 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -23,23 +23,25 @@ use arrow::array::{Array, ArrayRef, Float64Array}; use arrow::compute::kernels::cast_utils::IntervalUnit; use arrow::compute::{binary, cast, date_part, DatePart}; use arrow::datatypes::DataType::{ - Date32, Date64, Duration, Float64, Interval, Time32, Time64, Timestamp, + Date32, Date64, Duration, Interval, Time32, Time64, Timestamp, }; use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::datatypes::{DataType, TimeUnit}; -use datafusion_common::cast::{ - as_date32_array, as_date64_array, as_int32_array, as_time32_millisecond_array, - as_time32_second_array, as_time64_microsecond_array, as_time64_nanosecond_array, - as_timestamp_microsecond_array, as_timestamp_millisecond_array, - as_timestamp_nanosecond_array, as_timestamp_second_array, +use datafusion_common::{ + cast::{ + as_date32_array, as_date64_array, as_int32_array, as_time32_millisecond_array, + as_time32_second_array, as_time64_microsecond_array, as_time64_nanosecond_array, + as_timestamp_microsecond_array, as_timestamp_millisecond_array, + as_timestamp_nanosecond_array, as_timestamp_second_array, + }, + exec_err, internal_err, + types::logical_string, + ExprSchema, Result, ScalarValue, }; -use datafusion_common::types::logical_string; -use datafusion_common::{exec_err, Result, ScalarValue}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; -use datafusion_expr::TypeSignature; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + scalar_doc_sections::DOC_SECTION_DATETIME, ColumnarValue, Documentation, Expr, + ScalarUDFImpl, Signature, TypeSignature, Volatility, }; use datafusion_expr_common::signature::TypeSignatureClass; diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 4e7dd4401f24..5cf65bd92243 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1100,25 +1100,23 @@ SELECT date_part('nanosecond', timestamp '2020-09-08T12:00:12.12345678+00:00') query error SELECT date_part('second', '2020-09-08T12:00:12.12345678+00:00') -query R +query I SELECT date_part('second', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12.12345678 +12 -query R +query I SELECT date_part('millisecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12123.45678 +12.12345678 -query R +query I SELECT date_part('microsecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12123456.78 +12123.45678 -query R +query error DataFusion error: Internal error: unit Nanosecond not supported SELECT date_part('nanosecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ----- -12123456780 # test_date_part_time @@ -1362,15 +1360,15 @@ SELECT date_part('second', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanose 50.123456789 # Second argument should not be string, failed in postgres too -query error -select extract(second from '2024-08-09T12:13:14') +query error +select extract(second from '2024-08-09T12:13:14'); -query R -select extract(second from timestamp '2024-08-09T12:13:14') +query I +select extract(second from timestamp '2024-08-09T12:13:14'); ---- 14 -query R +query I select extract(seconds from timestamp '2024-08-09T12:13:14') ---- 14 From 1e43c90f38435794cd5fa8d48ee40986452f6b06 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Sun, 8 Dec 2024 15:21:16 +0800 Subject: [PATCH 11/18] fix comment Signed-off-by: Jay Zhan --- datafusion/expr-common/src/signature.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 462e92c95a0d..dee8fc618354 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -124,8 +124,7 @@ pub enum TypeSignature { /// /// See `comparison_coercion_numeric` for more details. Comparable(usize), - /// Fixed number of arguments of arbitrary types - /// If a function takes 0 argument, its `TypeSignature` should be `Any(0)` + /// Fixed number of arguments of arbitrary types, number should be larger than 0 Any(usize), /// Matches exactly one of a list of [`TypeSignature`]s. Coercion is attempted to match /// the signatures in order, and stops after the first success, if any. From afe48d13956eb8decb14dd340ef6c30ae8d4a697 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Sun, 8 Dec 2024 15:31:24 +0800 Subject: [PATCH 12/18] fix signature Signed-off-by: Jay Zhan --- datafusion/expr-common/src/signature.rs | 18 +----------------- datafusion/sqllogictest/test_files/expr.slt | 7 +------ .../sqllogictest/test_files/group_by.slt | 2 +- 3 files changed, 3 insertions(+), 24 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index dee8fc618354..f2cde4527f0c 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -156,7 +156,7 @@ impl TypeSignature { } } -#[derive(Debug, Clone, Eq, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Hash)] pub enum TypeSignatureClass { Timestamp, Date, @@ -169,22 +169,6 @@ pub enum TypeSignatureClass { Native(LogicalTypeRef), } -// TODO: MSRV issue: Default macro doesn't work in 1.79. Use default PartialEq macro after it is able to compile -impl PartialEq for TypeSignatureClass { - fn eq(&self, other: &Self) -> bool { - match (self, other) { - (Self::Native(l0), Self::Native(r0)) => l0 == r0, - _ => core::mem::discriminant(self) == core::mem::discriminant(other), - } - } -} - -impl std::hash::Hash for TypeSignatureClass { - fn hash(&self, state: &mut H) { - core::mem::discriminant(self).hash(state); - } -} - impl Display for TypeSignatureClass { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "TypeSignatureClass::{self:?}") diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index f7902af5d4b3..2a8ce39738c8 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1109,15 +1109,10 @@ SELECT date_part('second', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- 12 -query I -SELECT date_part('millisecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ----- -12 - query I SELECT date_part('microsecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12123 +12123456 query error DataFusion error: Internal error: unit Nanosecond not supported SELECT date_part('nanosecond', timestamp '2020-09-08T12:00:12.12345678+00:00') diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index fc09b5242141..df7e21c2da44 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -4281,7 +4281,7 @@ EXPLAIN SELECT extract(month from ts) as months logical_plan 01)Sort: months DESC NULLS FIRST, fetch=5 02)--Projection: date_part(Utf8("MONTH"),csv_with_timestamps.ts) AS months -03)----Aggregate: groupBy=[[date_part(Utf8View("MONTH"), csv_with_timestamps.ts) AS date_part(Utf8("MONTH"),csv_with_timestamps.ts)]], aggr=[[]] +03)----Aggregate: groupBy=[[date_part(Utf8("MONTH"), csv_with_timestamps.ts)]], aggr=[[]] 04)------TableScan: csv_with_timestamps projection=[ts] physical_plan 01)SortPreservingMergeExec: [months@0 DESC], fetch=5 From 13fb7ed84b5138a622c1279fcc0d9e00279a2608 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Mon, 9 Dec 2024 20:01:37 +0800 Subject: [PATCH 13/18] fix test Signed-off-by: Jay Zhan --- datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part | 2 +- datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part | 2 +- datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part index 217203fc4817..023af0ea9fbd 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q7.slt.part @@ -62,7 +62,7 @@ logical_plan 02)--Projection: shipping.supp_nation, shipping.cust_nation, shipping.l_year, sum(shipping.volume) AS revenue 03)----Aggregate: groupBy=[[shipping.supp_nation, shipping.cust_nation, shipping.l_year]], aggr=[[sum(shipping.volume)]] 04)------SubqueryAlias: shipping -05)--------Projection: n1.n_name AS supp_nation, n2.n_name AS cust_nation, date_part(Utf8View("YEAR"), lineitem.l_shipdate) AS l_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume +05)--------Projection: n1.n_name AS supp_nation, n2.n_name AS cust_nation, date_part(Utf8("YEAR"), lineitem.l_shipdate) AS l_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume 06)----------Inner Join: customer.c_nationkey = n2.n_nationkey Filter: n1.n_name = Utf8("FRANCE") AND n2.n_name = Utf8("GERMANY") OR n1.n_name = Utf8("GERMANY") AND n2.n_name = Utf8("FRANCE") 07)------------Projection: lineitem.l_extendedprice, lineitem.l_discount, lineitem.l_shipdate, customer.c_nationkey, n1.n_name 08)--------------Inner Join: supplier.s_nationkey = n1.n_nationkey diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part index 8d84d6c59652..2bcab40dc985 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q8.slt.part @@ -60,7 +60,7 @@ logical_plan 02)--Projection: all_nations.o_year, CAST(CAST(sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END) AS Decimal128(12, 2)) / CAST(sum(all_nations.volume) AS Decimal128(12, 2)) AS Decimal128(15, 2)) AS mkt_share 03)----Aggregate: groupBy=[[all_nations.o_year]], aggr=[[sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Decimal128(Some(0),38,4) END) AS sum(CASE WHEN all_nations.nation = Utf8("BRAZIL") THEN all_nations.volume ELSE Int64(0) END), sum(all_nations.volume)]] 04)------SubqueryAlias: all_nations -05)--------Projection: date_part(Utf8View("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume, n2.n_name AS nation +05)--------Projection: date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS volume, n2.n_name AS nation 06)----------Inner Join: n1.n_regionkey = region.r_regionkey 07)------------Projection: lineitem.l_extendedprice, lineitem.l_discount, orders.o_orderdate, n1.n_regionkey, n2.n_name 08)--------------Inner Join: supplier.s_nationkey = n2.n_nationkey diff --git a/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part b/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part index e45c300fd558..4a288893da95 100644 --- a/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/plans/q9.slt.part @@ -56,7 +56,7 @@ logical_plan 02)--Projection: profit.nation, profit.o_year, sum(profit.amount) AS sum_profit 03)----Aggregate: groupBy=[[profit.nation, profit.o_year]], aggr=[[sum(profit.amount)]] 04)------SubqueryAlias: profit -05)--------Projection: nation.n_name AS nation, date_part(Utf8View("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) - partsupp.ps_supplycost * lineitem.l_quantity AS amount +05)--------Projection: nation.n_name AS nation, date_part(Utf8("YEAR"), orders.o_orderdate) AS o_year, lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) - partsupp.ps_supplycost * lineitem.l_quantity AS amount 06)----------Inner Join: supplier.s_nationkey = nation.n_nationkey 07)------------Projection: lineitem.l_quantity, lineitem.l_extendedprice, lineitem.l_discount, supplier.s_nationkey, partsupp.ps_supplycost, orders.o_orderdate 08)--------------Inner Join: lineitem.l_orderkey = orders.o_orderkey From f141e899e82e730feb0ab6d40cc2643527c2a1b5 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Tue, 10 Dec 2024 20:10:32 +0800 Subject: [PATCH 14/18] Enhance type coercion for timestamps to allow implicit casting from strings. Update SQL logic tests to reflect changes in timestamp handling, including expected outputs for queries involving nanoseconds and seconds. --- datafusion/expr/src/type_coercion/functions.rs | 4 ++++ datafusion/sqllogictest/test_files/expr.slt | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 46d7f0c0f80d..1998596e47a9 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -598,6 +598,10 @@ fn get_valid_types( current_type ) } + // Not consistent with Postgres and DuckDB but to avoid regression we implicit cast string to timestamp + TypeSignatureClass::Timestamp if logical_type == NativeType::String => { + Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) + } TypeSignatureClass::Timestamp if logical_type.is_timestamp() => { Ok(current_type.to_owned()) } diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 2a8ce39738c8..876704d2a917 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1100,9 +1100,10 @@ SELECT date_part('microsecond', timestamp '2020-09-08T12:00:12.12345678+00:00') query error DataFusion error: Internal error: unit Nanosecond not supported SELECT date_part('nanosecond', timestamp '2020-09-08T12:00:12.12345678+00:00') -# Second argument should not be string, failed in postgres too. query error SELECT date_part('second', '2020-09-08T12:00:12.12345678+00:00') +---- +12 query I SELECT date_part('second', timestamp '2020-09-08T12:00:12.12345678+00:00') @@ -1337,9 +1338,10 @@ SELECT date_part('second', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanose ---- 50 -# Second argument should not be string, failed in postgres too query error select extract(second from '2024-08-09T12:13:14'); +---- +14 query I select extract(second from timestamp '2024-08-09T12:13:14'); From e89520e75a990aa07e986c3179c0f1ab5109e238 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Tue, 10 Dec 2024 20:20:55 +0800 Subject: [PATCH 15/18] Refactor type coercion logic for timestamps to improve readability and maintainability. Update the `TypeSignatureClass` documentation to clarify its purpose in function signatures, particularly regarding coercible types. This change enhances the handling of implicit casting from strings to timestamps. --- datafusion/expr-common/src/signature.rs | 10 +++++++++- datafusion/expr/src/type_coercion/functions.rs | 4 +++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index f2cde4527f0c..81e3732920f1 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -156,6 +156,14 @@ impl TypeSignature { } } +/// Represents the class of types that can be used in a function signature. +/// +/// This is used to specify what types are valid for function arguments in a more flexible way than +/// just listing specific DataTypes. For example, TypeSignatureClass::Timestamp matches any timestamp +/// type regardless of timezone or precision. +/// +/// Used primarily with TypeSignature::Coercible to define function signatures that can accept +/// arguments that can be coerced to a particular class of types. #[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Hash)] pub enum TypeSignatureClass { Timestamp, @@ -163,10 +171,10 @@ pub enum TypeSignatureClass { Time, Interval, Duration, + Native(LogicalTypeRef), // TODO: // Numeric // Integer - Native(LogicalTypeRef), } impl Display for TypeSignatureClass { diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 1998596e47a9..b12489167b8f 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -599,7 +599,9 @@ fn get_valid_types( ) } // Not consistent with Postgres and DuckDB but to avoid regression we implicit cast string to timestamp - TypeSignatureClass::Timestamp if logical_type == NativeType::String => { + TypeSignatureClass::Timestamp + if logical_type == NativeType::String => + { Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) } TypeSignatureClass::Timestamp if logical_type.is_timestamp() => { From 7920421c94753e3f46eb203273f39a314203a795 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Wed, 11 Dec 2024 09:56:47 +0800 Subject: [PATCH 16/18] Fix SQL logic tests to correct query error handling for timestamp functions. Updated expected outputs for `date_part` and `extract` functions to reflect proper behavior with nanoseconds and seconds. This change improves the accuracy of test cases in the `expr.slt` file. --- datafusion/sqllogictest/test_files/expr.slt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 876704d2a917..00d5f25af20d 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1100,7 +1100,7 @@ SELECT date_part('microsecond', timestamp '2020-09-08T12:00:12.12345678+00:00') query error DataFusion error: Internal error: unit Nanosecond not supported SELECT date_part('nanosecond', timestamp '2020-09-08T12:00:12.12345678+00:00') -query error +query I SELECT date_part('second', '2020-09-08T12:00:12.12345678+00:00') ---- 12 @@ -1338,7 +1338,7 @@ SELECT date_part('second', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanose ---- 50 -query error +query I select extract(second from '2024-08-09T12:13:14'); ---- 14 From 4a7404d071e993f6f19e8354575add57c26b8fb8 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Wed, 11 Dec 2024 10:00:56 +0800 Subject: [PATCH 17/18] Enhance timestamp handling in TypeSignature to support timezone specification. Updated the logic to include an additional DataType for timestamps with a timezone wildcard, improving flexibility in timestamp operations. --- datafusion/expr-common/src/signature.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 81e3732920f1..148ddac73a57 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -322,7 +322,13 @@ impl TypeSignature { .map(|logical_type| match logical_type { TypeSignatureClass::Native(l) => get_data_types(l.native()), TypeSignatureClass::Timestamp => { - vec![DataType::Timestamp(TimeUnit::Nanosecond, None)] + vec![ + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp( + TimeUnit::Nanosecond, + Some(TIMEZONE_WILDCARD.into()), + ), + ] } TypeSignatureClass::Date => { vec![DataType::Date64] From 830b2d55d63ca87584539c72653aea6dcadc7607 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Fri, 13 Dec 2024 18:51:10 +0800 Subject: [PATCH 18/18] Refactor date_part function: remove redundant imports and add missing not_impl_err import for better error handling --- datafusion/functions/src/datetime/date_part.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 60be8a2718d7..b43fcb6db706 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -28,6 +28,7 @@ use arrow::datatypes::DataType::{ use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; use arrow::datatypes::{DataType, TimeUnit}; +use datafusion_common::not_impl_err; use datafusion_common::{ cast::{ as_date32_array, as_date64_array, as_int32_array, as_time32_millisecond_array, @@ -39,11 +40,6 @@ use datafusion_common::{ types::logical_string, ExprSchema, Result, ScalarValue, }; -use datafusion_common::{ - exec_err, internal_err, not_impl_err, ExprSchema, Result, ScalarValue, -}; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; -use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ scalar_doc_sections::DOC_SECTION_DATETIME, ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, TypeSignature, Volatility,