From 8a5be1330e30e6dd7760dba910737550d760e612 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Fri, 19 Jul 2024 20:17:13 +0100 Subject: [PATCH] Enable casting from Utf8View (#6077) * Enable casting from Utf8View -> string or temporal types * save * implement casting utf8view -> timestamp/interval types, with tests * fix clippy * fmt --------- Co-authored-by: Andrew Lamb --- arrow-cast/src/cast/mod.rs | 150 ++++++++++++++++++++++------ arrow-cast/src/cast/string.rs | 177 +++++++++++++++++++++++++++------- 2 files changed, 265 insertions(+), 62 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index c9de714e7d55..1770157bcfd9 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -210,7 +210,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView) => true, (FixedSizeBinary(_), Binary | LargeBinary) => true, ( - Utf8 | LargeUtf8, + Utf8 | LargeUtf8 | Utf8View, Binary | LargeBinary | Utf8 @@ -228,7 +228,6 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Interval(_), ) => true, (Utf8 | LargeUtf8, Utf8View) => true, - (Utf8View, Utf8 | LargeUtf8) => true, (BinaryView, Binary | LargeBinary) => true, (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), @@ -1269,6 +1268,56 @@ pub fn cast_with_options( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, + (Utf8View, _) => match to_type { + UInt8 => parse_string_view::(array, cast_options), + UInt16 => parse_string_view::(array, cast_options), + UInt32 => parse_string_view::(array, cast_options), + UInt64 => parse_string_view::(array, cast_options), + Int8 => parse_string_view::(array, cast_options), + Int16 => parse_string_view::(array, cast_options), + Int32 => parse_string_view::(array, cast_options), + Int64 => parse_string_view::(array, cast_options), + Float32 
=> parse_string_view::(array, cast_options), + Float64 => parse_string_view::(array, cast_options), + Date32 => parse_string_view::(array, cast_options), + Date64 => parse_string_view::(array, cast_options), + Binary => cast_view_to_byte::>(array), + LargeBinary => cast_view_to_byte::>(array), + Utf8 => cast_view_to_byte::>(array), + LargeUtf8 => cast_view_to_byte::>(array), + Time32(TimeUnit::Second) => parse_string_view::(array, cast_options), + Time32(TimeUnit::Millisecond) => { + parse_string_view::(array, cast_options) + } + Time64(TimeUnit::Microsecond) => { + parse_string_view::(array, cast_options) + } + Time64(TimeUnit::Nanosecond) => { + parse_string_view::(array, cast_options) + } + Timestamp(TimeUnit::Second, to_tz) => { + cast_view_to_timestamp::(array, to_tz, cast_options) + } + Timestamp(TimeUnit::Millisecond, to_tz) => { + cast_view_to_timestamp::(array, to_tz, cast_options) + } + Timestamp(TimeUnit::Microsecond, to_tz) => { + cast_view_to_timestamp::(array, to_tz, cast_options) + } + Timestamp(TimeUnit::Nanosecond, to_tz) => { + cast_view_to_timestamp::(array, to_tz, cast_options) + } + Interval(IntervalUnit::YearMonth) => { + cast_view_to_year_month_interval(array, cast_options) + } + Interval(IntervalUnit::DayTime) => cast_view_to_day_time_interval(array, cast_options), + Interval(IntervalUnit::MonthDayNano) => { + cast_view_to_month_day_nano_interval(array, cast_options) + } + _ => Err(ArrowError::CastError(format!( + "Casting from {from_type:?} to {to_type:?} not supported", + ))), + }, (LargeUtf8, _) => match to_type { UInt8 => parse_string::(array, cast_options), UInt16 => parse_string::(array, cast_options), @@ -1365,8 +1414,6 @@ pub fn cast_with_options( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - (Utf8View, Utf8) => cast_view_to_byte::>(array), - (Utf8View, LargeUtf8) => cast_view_to_byte::>(array), (BinaryView, Binary) => cast_view_to_byte::>(array), (BinaryView, LargeBinary) => { cast_view_to_byte::>(array) @@ 
-3960,6 +4007,11 @@ mod tests { #[test] fn test_cast_string_to_timestamp() { + let a0 = Arc::new(StringViewArray::from(vec![ + Some("2020-09-08T12:00:00.123456789+00:00"), + Some("Not a valid date"), + None, + ])) as ArrayRef; let a1 = Arc::new(StringArray::from(vec![ Some("2020-09-08T12:00:00.123456789+00:00"), Some("Not a valid date"), @@ -3970,7 +4022,7 @@ mod tests { Some("Not a valid date"), None, ])) as ArrayRef; - for array in &[a1, a2] { + for array in &[a0, a1, a2] { for time_unit in &[ TimeUnit::Second, TimeUnit::Millisecond, @@ -4039,6 +4091,11 @@ mod tests { #[test] fn test_cast_string_to_date32() { + let a0 = Arc::new(StringViewArray::from(vec![ + Some("2018-12-25"), + Some("Not a valid date"), + None, + ])) as ArrayRef; let a1 = Arc::new(StringArray::from(vec![ Some("2018-12-25"), Some("Not a valid date"), @@ -4049,7 +4106,7 @@ mod tests { Some("Not a valid date"), None, ])) as ArrayRef; - for array in &[a1, a2] { + for array in &[a0, a1, a2] { let to_type = DataType::Date32; let b = cast(array, &to_type).unwrap(); let c = b.as_primitive::(); @@ -4071,30 +4128,47 @@ mod tests { #[test] fn test_cast_string_format_yyyymmdd_to_date32() { - let a = Arc::new(StringArray::from(vec![ + let a0 = Arc::new(StringViewArray::from(vec![ + Some("2020-12-25"), + Some("20201117"), + ])) as ArrayRef; + let a1 = Arc::new(StringArray::from(vec![ + Some("2020-12-25"), + Some("20201117"), + ])) as ArrayRef; + let a2 = Arc::new(LargeStringArray::from(vec![ Some("2020-12-25"), Some("20201117"), ])) as ArrayRef; - let to_type = DataType::Date32; - let options = CastOptions { - safe: false, - format_options: FormatOptions::default(), - }; - let result = cast_with_options(&a, &to_type, &options).unwrap(); - let c = result.as_primitive::(); - assert_eq!( - chrono::NaiveDate::from_ymd_opt(2020, 12, 25), - c.value_as_date(0) - ); - assert_eq!( - chrono::NaiveDate::from_ymd_opt(2020, 11, 17), - c.value_as_date(1) - ); + for array in &[a0, a1, a2] { + let to_type = 
DataType::Date32; + let options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + let result = cast_with_options(&array, &to_type, &options).unwrap(); + let c = result.as_primitive::(); + assert_eq!( + chrono::NaiveDate::from_ymd_opt(2020, 12, 25), + c.value_as_date(0) + ); + assert_eq!( + chrono::NaiveDate::from_ymd_opt(2020, 11, 17), + c.value_as_date(1) + ); + } } #[test] fn test_cast_string_to_time32second() { + let a0 = Arc::new(StringViewArray::from(vec![ + Some("08:08:35.091323414"), + Some("08:08:60.091323414"), // leap second + Some("08:08:61.091323414"), // not valid + Some("Not a valid time"), + None, + ])) as ArrayRef; let a1 = Arc::new(StringArray::from(vec![ Some("08:08:35.091323414"), Some("08:08:60.091323414"), // leap second @@ -4109,7 +4183,7 @@ mod tests { Some("Not a valid time"), None, ])) as ArrayRef; - for array in &[a1, a2] { + for array in &[a0, a1, a2] { let to_type = DataType::Time32(TimeUnit::Second); let b = cast(array, &to_type).unwrap(); let c = b.as_primitive::(); @@ -4130,6 +4204,13 @@ mod tests { #[test] fn test_cast_string_to_time32millisecond() { + let a0 = Arc::new(StringViewArray::from(vec![ + Some("08:08:35.091323414"), + Some("08:08:60.091323414"), // leap second + Some("08:08:61.091323414"), // not valid + Some("Not a valid time"), + None, + ])) as ArrayRef; let a1 = Arc::new(StringArray::from(vec![ Some("08:08:35.091323414"), Some("08:08:60.091323414"), // leap second @@ -4144,7 +4225,7 @@ mod tests { Some("Not a valid time"), None, ])) as ArrayRef; - for array in &[a1, a2] { + for array in &[a0, a1, a2] { let to_type = DataType::Time32(TimeUnit::Millisecond); let b = cast(array, &to_type).unwrap(); let c = b.as_primitive::(); @@ -4165,6 +4246,11 @@ mod tests { #[test] fn test_cast_string_to_time64microsecond() { + let a0 = Arc::new(StringViewArray::from(vec![ + Some("08:08:35.091323414"), + Some("Not a valid time"), + None, + ])) as ArrayRef; let a1 = Arc::new(StringArray::from(vec![ 
Some("08:08:35.091323414"), Some("Not a valid time"), @@ -4175,7 +4261,7 @@ mod tests { Some("Not a valid time"), None, ])) as ArrayRef; - for array in &[a1, a2] { + for array in &[a0, a1, a2] { let to_type = DataType::Time64(TimeUnit::Microsecond); let b = cast(array, &to_type).unwrap(); let c = b.as_primitive::(); @@ -4194,6 +4280,11 @@ mod tests { #[test] fn test_cast_string_to_time64nanosecond() { + let a0 = Arc::new(StringViewArray::from(vec![ + Some("08:08:35.091323414"), + Some("Not a valid time"), + None, + ])) as ArrayRef; let a1 = Arc::new(StringArray::from(vec![ Some("08:08:35.091323414"), Some("Not a valid time"), @@ -4204,7 +4295,7 @@ mod tests { Some("Not a valid time"), None, ])) as ArrayRef; - for array in &[a1, a2] { + for array in &[a0, a1, a2] { let to_type = DataType::Time64(TimeUnit::Nanosecond); let b = cast(array, &to_type).unwrap(); let c = b.as_primitive::(); @@ -4223,6 +4314,11 @@ mod tests { #[test] fn test_cast_string_to_date64() { + let a0 = Arc::new(StringViewArray::from(vec![ + Some("2020-09-08T12:00:00"), + Some("Not a valid date"), + None, + ])) as ArrayRef; let a1 = Arc::new(StringArray::from(vec![ Some("2020-09-08T12:00:00"), Some("Not a valid date"), @@ -4233,7 +4329,7 @@ mod tests { Some("Not a valid date"), None, ])) as ArrayRef; - for array in &[a1, a2] { + for array in &[a0, a1, a2] { let to_type = DataType::Date64; let b = cast(array, &to_type).unwrap(); let c = b.as_primitive::(); diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs index 4b83a2a5e7da..7d0e7e21c859 100644 --- a/arrow-cast/src/cast/string.rs +++ b/arrow-cast/src/cast/string.rs @@ -16,6 +16,7 @@ // under the License. 
use crate::cast::*; +use arrow_buffer::NullBuffer; pub(crate) fn value_to_string( array: &dyn Array, @@ -43,8 +44,34 @@ pub(crate) fn parse_string( cast_options: &CastOptions, ) -> Result { let string_array = array.as_string::(); + parse_string_iter::(string_array.iter(), cast_options, || { + string_array.nulls().cloned() + }) +} + +/// Parse UTF-8 View +pub(crate) fn parse_string_view( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + let string_view_array = array.as_string_view(); + parse_string_iter::(string_view_array.iter(), cast_options, || { + string_view_array.nulls().cloned() + }) +} + +fn parse_string_iter< + 'a, + P: Parser, + I: Iterator>, + F: FnOnce() -> Option, +>( + iter: I, + cast_options: &CastOptions, + nulls: F, +) -> Result { let array = if cast_options.safe { - let iter = string_array.iter().map(|x| x.and_then(P::parse)); + let iter = iter.map(|x| x.and_then(P::parse)); // Benefit: // 20% performance improvement @@ -52,8 +79,7 @@ pub(crate) fn parse_string( // The iterator is trustedLen because it comes from an `StringArray`. unsafe { PrimitiveArray::<P>
::from_trusted_len_iter(iter) } } else { - let v = string_array - .iter() + let v = iter .map(|x| match x { Some(v) => P::parse(v).ok_or_else(|| { ArrowError::CastError(format!( @@ -65,7 +91,7 @@ pub(crate) fn parse_string( None => Ok(P::Native::default()), }) .collect::, ArrowError>>()?; - PrimitiveArray::new(v.into(), string_array.nulls().cloned()) + PrimitiveArray::new(v.into(), nulls()) }; Ok(Arc::new(array) as ArrayRef) @@ -81,20 +107,42 @@ pub(crate) fn cast_string_to_timestamp = match to_tz { Some(tz) => { let tz: Tz = tz.as_ref().parse()?; - cast_string_to_timestamp_impl(array, &tz, cast_options)? + cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)? + } + None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?, + }; + Ok(Arc::new(out.with_timezone_opt(to_tz.clone()))) +} + +/// Casts string view arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.) +pub(crate) fn cast_view_to_timestamp( + array: &dyn Array, + to_tz: &Option>, + cast_options: &CastOptions, +) -> Result { + let array = array.as_string_view(); + let out: PrimitiveArray = match to_tz { + Some(tz) => { + let tz: Tz = tz.as_ref().parse()?; + cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)? 
} - None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?, + None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?, }; Ok(Arc::new(out.with_timezone_opt(to_tz.clone()))) } -fn cast_string_to_timestamp_impl( - array: &GenericStringArray, +fn cast_string_to_timestamp_impl< + 'a, + I: Iterator>, + T: ArrowTimestampType, + Tz: TimeZone, +>( + iter: I, tz: &Tz, cast_options: &CastOptions, ) -> Result, ArrowError> { if cast_options.safe { - let iter = array.iter().map(|v| { + let iter = iter.map(|v| { v.and_then(|v| { let naive = string_to_datetime(tz, v).ok()?.naive_utc(); T::make_value(naive) @@ -107,8 +155,7 @@ fn cast_string_to_timestamp_impl>, _>>()?; @@ -148,29 +195,11 @@ where .as_any() .downcast_ref::>() .unwrap(); - let interval_array = if cast_options.safe { - let iter = string_array - .iter() - .map(|v| v.and_then(|v| parse_function(v).ok())); - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } - } else { - let vec = string_array - .iter() - .map(|v| v.map(parse_function).transpose()) - .collect::, ArrowError>>()?; - - // Benefit: - // 20% performance improvement - // Soundness: - // The iterator is trustedLen because it comes from an `StringArray`. 
- unsafe { PrimitiveArray::::from_trusted_len_iter(vec) } - }; - Ok(Arc::new(interval_array) as ArrayRef) + cast_string_to_interval_impl::<_, ArrowType, F>( + string_array.iter(), + cast_options, + parse_function, + ) } pub(crate) fn cast_string_to_year_month_interval( @@ -206,6 +235,84 @@ pub(crate) fn cast_string_to_month_day_nano_interval( ) } +pub(crate) fn cast_view_to_interval( + array: &dyn Array, + cast_options: &CastOptions, + parse_function: F, +) -> Result +where + ArrowType: ArrowPrimitiveType, + F: Fn(&str) -> Result + Copy, +{ + let string_view_array = array.as_any().downcast_ref::().unwrap(); + cast_string_to_interval_impl::<_, ArrowType, F>( + string_view_array.iter(), + cast_options, + parse_function, + ) +} + +pub(crate) fn cast_view_to_year_month_interval( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + cast_view_to_interval::<_, IntervalYearMonthType>( + array, + cast_options, + parse_interval_year_month, + ) +} + +pub(crate) fn cast_view_to_day_time_interval( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + cast_view_to_interval::<_, IntervalDayTimeType>(array, cast_options, parse_interval_day_time) +} + +pub(crate) fn cast_view_to_month_day_nano_interval( + array: &dyn Array, + cast_options: &CastOptions, +) -> Result { + cast_view_to_interval::<_, IntervalMonthDayNanoType>( + array, + cast_options, + parse_interval_month_day_nano, + ) +} + +fn cast_string_to_interval_impl<'a, I, ArrowType, F>( + iter: I, + cast_options: &CastOptions, + parse_function: F, +) -> Result +where + I: Iterator>, + ArrowType: ArrowPrimitiveType, + F: Fn(&str) -> Result + Copy, +{ + let interval_array = if cast_options.safe { + let iter = iter.map(|v| v.and_then(|v| parse_function(v).ok())); + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. 
+ unsafe { PrimitiveArray::::from_trusted_len_iter(iter) } + } else { + let vec = iter + .map(|v| v.map(parse_function).transpose()) + .collect::, ArrowError>>()?; + + // Benefit: + // 20% performance improvement + // Soundness: + // The iterator is trustedLen because it comes from an `StringArray`. + unsafe { PrimitiveArray::::from_trusted_len_iter(vec) } + }; + Ok(Arc::new(interval_array) as ArrayRef) +} + /// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same /// offset size so re-encoding offset is unnecessary. pub(crate) fn cast_binary_to_string(