From 57f79c03a8dee9d8bf8601bf555aa271746913fe Mon Sep 17 00:00:00 2001 From: Marko Grujic Date: Thu, 23 Feb 2023 17:02:52 +0100 Subject: [PATCH] Enable casting of string to timestamp with microsecond resolution (#3752) * Enable casting of string to timestamp with microsecond resolution * Enable string conversion to timestamp with second and millisecond resolution --- arrow-cast/src/cast.rs | 128 ++++++++++++++++++++++++++++++++--------- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 49461b14c339..d49775c98211 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -166,6 +166,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time32(TimeUnit::Millisecond) | Time64(TimeUnit::Microsecond) | Time64(TimeUnit::Nanosecond) + | Timestamp(TimeUnit::Second, _) + | Timestamp(TimeUnit::Millisecond, _) + | Timestamp(TimeUnit::Microsecond, _) | Timestamp(TimeUnit::Nanosecond, _) ) => true, (Utf8, _) => to_type.is_numeric() && to_type != &Float16, @@ -179,6 +182,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time32(TimeUnit::Millisecond) | Time64(TimeUnit::Microsecond) | Time64(TimeUnit::Nanosecond) + | Timestamp(TimeUnit::Second, _) + | Timestamp(TimeUnit::Millisecond, _) + | Timestamp(TimeUnit::Microsecond, _) | Timestamp(TimeUnit::Nanosecond, _) ) => true, (LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, @@ -1141,8 +1147,17 @@ pub fn cast_with_options( Time64(TimeUnit::Nanosecond) => { cast_string_to_time64nanosecond::(array, cast_options) } + Timestamp(TimeUnit::Second, _) => { + cast_string_to_timestamp::(array, cast_options) + } + Timestamp(TimeUnit::Millisecond, _) => { + cast_string_to_timestamp::(array, cast_options) + } + Timestamp(TimeUnit::Microsecond, _) => { + cast_string_to_timestamp::(array, cast_options) + } Timestamp(TimeUnit::Nanosecond, _) => { - cast_string_to_timestamp_ns::(array, cast_options) + cast_string_to_timestamp::(array, cast_options) } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", @@ -1182,8 +1197,17 @@ pub fn cast_with_options( Time64(TimeUnit::Nanosecond) => { cast_string_to_time64nanosecond::(array, cast_options) } + Timestamp(TimeUnit::Second, _) => { + cast_string_to_timestamp::(array, cast_options) + } + Timestamp(TimeUnit::Millisecond, _) => { + cast_string_to_timestamp::(array, cast_options) + } + Timestamp(TimeUnit::Microsecond, _) => { + cast_string_to_timestamp::(array, cast_options) + } Timestamp(TimeUnit::Nanosecond, _) => { - cast_string_to_timestamp_ns::(array, cast_options) + cast_string_to_timestamp::(array, cast_options) } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", @@ -2552,8 +2576,11 @@ fn cast_string_to_time64nanosecond( Ok(Arc::new(array) as ArrayRef) } -/// Casts generic string arrays to TimeStampNanosecondArray -fn cast_string_to_timestamp_ns( +/// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.) +fn cast_string_to_timestamp< + Offset: OffsetSizeTrait, + TimestampType: ArrowTimestampType, +>( array: &dyn Array, cast_options: &CastOptions, ) -> Result { @@ -2562,26 +2589,36 @@ fn cast_string_to_timestamp_ns( .downcast_ref::>() .unwrap(); + let scale_factor = match TimestampType::get_time_unit() { + TimeUnit::Second => 1_000_000_000, + TimeUnit::Millisecond => 1_000_000, + TimeUnit::Microsecond => 1_000, + TimeUnit::Nanosecond => 1, + }; + let array = if cast_options.safe { - let iter = string_array - .iter() - .map(|v| v.and_then(|v| string_to_timestamp_nanos(v).ok())); + let iter = string_array.iter().map(|v| { + v.and_then(|v| string_to_timestamp_nanos(v).ok().map(|t| t / scale_factor)) + }); // Benefit: // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { TimestampNanosecondArray::from_trusted_len_iter(iter) } + unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } } else { let vec = string_array .iter() - .map(|v| v.map(string_to_timestamp_nanos).transpose()) + .map(|v| { + v.map(|v| string_to_timestamp_nanos(v).map(|t| t / scale_factor)) + .transpose() + }) .collect::>, _>>()?; // Benefit: // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { TimestampNanosecondArray::from_trusted_len_iter(vec.iter()) } + unsafe { PrimitiveArray::::from_trusted_len_iter(vec.iter()) } }; Ok(Arc::new(array) as ArrayRef) @@ -4704,32 +4741,69 @@ mod tests { #[test] fn test_cast_string_to_timestamp() { let a1 = Arc::new(StringArray::from(vec![ - Some("2020-09-08T12:00:00+00:00"), + Some("2020-09-08T12:00:00.123456789+00:00"), Some("Not a valid date"), None, ])) as ArrayRef; let a2 = Arc::new(LargeStringArray::from(vec![ - Some("2020-09-08T12:00:00+00:00"), + Some("2020-09-08T12:00:00.123456789+00:00"), Some("Not a valid date"), None, ])) as ArrayRef; for array in &[a1, a2] { - let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None); - let b = cast(array, &to_type).unwrap(); - let c = b - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(1599566400000000000, c.value(0)); - assert!(c.is_null(1)); - assert!(c.is_null(2)); + for time_unit in &[ + TimeUnit::Second, + TimeUnit::Millisecond, + TimeUnit::Microsecond, + TimeUnit::Nanosecond, + ] { + let to_type = DataType::Timestamp(time_unit.clone(), None); + let b = cast(array, &to_type).unwrap(); + + match time_unit { + TimeUnit::Second => { + let c = + b.as_any().downcast_ref::().unwrap(); + assert_eq!(1599566400, c.value(0)); + assert!(c.is_null(1)); + assert!(c.is_null(2)); + } + TimeUnit::Millisecond => { + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1599566400123, c.value(0)); + assert!(c.is_null(1)); + assert!(c.is_null(2)); + } + TimeUnit::Microsecond => { + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1599566400123456, c.value(0)); + assert!(c.is_null(1)); + assert!(c.is_null(2)); + } + TimeUnit::Nanosecond => { + let c = b + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(1599566400123456789, c.value(0)); + assert!(c.is_null(1)); + assert!(c.is_null(2)); + } + } - let options = CastOptions { safe: false }; - let err = cast_with_options(array, &to_type, &options).unwrap_err(); - assert_eq!( - err.to_string(), - "Cast error: Error parsing 'Not a valid date' as timestamp" - ); + let options = CastOptions { safe: false }; + let err = cast_with_options(array, &to_type, &options).unwrap_err(); + assert_eq!( + err.to_string(), + "Cast error: Error parsing 'Not a valid date' as timestamp" + ); + } } }