From 2de1fb501941142cd8306291ed338d22be391281 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 6 Feb 2023 12:45:58 +0000 Subject: [PATCH 1/4] Use ArrayFormatter in cast kernel --- arrow-cast/src/cast.rs | 259 ++++------------------------------------- 1 file changed, 24 insertions(+), 235 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 69e42a5485e6..b59ada5800e6 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -35,15 +35,14 @@ //! assert_eq!(7.0, c.value(2)); //! ``` -use chrono::{DateTime, NaiveDateTime, NaiveTime, Timelike}; +use chrono::{NaiveTime, Timelike}; use std::cmp::Ordering; use std::sync::Arc; -use crate::display::{array_value_to_string, lexical_to_string}; +use crate::display::{array_value_to_string, ArrayFormatter, FormatOptions}; use crate::parse::string_to_timestamp_nanos; use arrow_array::{ - builder::*, cast::*, iterator::ArrayIter, temporal_conversions::*, timezone::Tz, - types::*, *, + builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *, }; use arrow_buffer::{i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_data::ArrayData; @@ -1152,75 +1151,12 @@ pub fn cast_with_options( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - (_, Utf8) => match from_type { - LargeUtf8 => cast_byte_container::(array), - UInt8 => cast_numeric_to_string::(array), - UInt16 => cast_numeric_to_string::(array), - UInt32 => cast_numeric_to_string::(array), - UInt64 => cast_numeric_to_string::(array), - Int8 => cast_numeric_to_string::(array), - Int16 => cast_numeric_to_string::(array), - Int32 => cast_numeric_to_string::(array), - Int64 => cast_numeric_to_string::(array), - Float32 => cast_numeric_to_string::(array), - Float64 => cast_numeric_to_string::(array), - Timestamp(TimeUnit::Nanosecond, tz) => cast_timestamp_to_string::< - TimestampNanosecondType, - i32, - >(array, tz.as_ref()), - Timestamp(TimeUnit::Microsecond, tz) => cast_timestamp_to_string::< - TimestampMicrosecondType, - i32, - >(array, tz.as_ref()), - Timestamp(TimeUnit::Millisecond, tz) => cast_timestamp_to_string::< - TimestampMillisecondType, - i32, - >(array, tz.as_ref()), - Timestamp(TimeUnit::Second, tz) => { - cast_timestamp_to_string::(array, tz.as_ref()) - } - Date32 => cast_date32_to_string::(array), - Date64 => cast_date64_to_string::(array), - Binary => cast_binary_to_string::(array, cast_options), - LargeBinary => cast_binary_to_generic_string::(array, cast_options), - _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", - ))), - }, - (_, LargeUtf8) => match from_type { - UInt8 => cast_numeric_to_string::(array), - UInt16 => cast_numeric_to_string::(array), - UInt32 => cast_numeric_to_string::(array), - UInt64 => cast_numeric_to_string::(array), - Int8 => cast_numeric_to_string::(array), - Int16 => cast_numeric_to_string::(array), - Int32 => cast_numeric_to_string::(array), - Int64 => cast_numeric_to_string::(array), - Float32 => cast_numeric_to_string::(array), - Float64 => cast_numeric_to_string::(array), - Timestamp(TimeUnit::Nanosecond, tz) => cast_timestamp_to_string::< - TimestampNanosecondType, - i64, - >(array, tz.as_ref()), - Timestamp(TimeUnit::Microsecond, tz) => cast_timestamp_to_string::< - TimestampMicrosecondType, - i64, - >(array, tz.as_ref()), - Timestamp(TimeUnit::Millisecond, tz) => cast_timestamp_to_string::< - TimestampMillisecondType, - i64, - >(array, tz.as_ref()), - Timestamp(TimeUnit::Second, tz) => { - cast_timestamp_to_string::(array, tz.as_ref()) - } - Date32 => cast_date32_to_string::(array), - Date64 => cast_date64_to_string::(array), - Binary => cast_binary_to_generic_string::(array, cast_options), - LargeBinary => cast_binary_to_string::(array, cast_options), - _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", - ))), - }, + (Binary, Utf8) => cast_binary_to_string::(array, cast_options), + (LargeBinary, LargeUtf8) => cast_binary_to_string::(array, cast_options), + (LargeBinary, Utf8) => cast_binary_to_generic_string::(array, cast_options), + (Binary, LargeUtf8) => cast_binary_to_generic_string::(array, cast_options), + (_, LargeUtf8) => value_to_string::(array), + (_, Utf8) => value_to_string::(array), (LargeUtf8, _) => match to_type { UInt8 => cast_string_to_numeric::(array, cast_options), UInt16 => cast_string_to_numeric::(array, cast_options), @@ -2171,172 +2107,25 @@ where from.unary_opt::<_, R>(num::cast::cast::) } -fn as_time_with_string_op< - A: ArrayAccessor, - OffsetSize, - T: ArrowTemporalType, - F, ->( - iter: ArrayIter, - mut builder: GenericStringBuilder, - op: F, -) -> ArrayRef -where - OffsetSize: OffsetSizeTrait, - F: Fn(NaiveDateTime) -> String, - i64: From, -{ - iter.into_iter().for_each(|value| { - if let Some(value) = value { - match as_datetime::(>::from(value)) { - Some(dt) => builder.append_value(op(dt)), - None => builder.append_null(), +fn value_to_string( + array: &dyn Array, +) -> Result { + let mut builder = GenericStringBuilder::::new(); + let options = FormatOptions::default(); + let formatter = ArrayFormatter::try_new(array, &options)?; + let data = array.data(); + for i in 0..data.len() { + match data.is_null(i) { + true => builder.append_null(), + false => { + formatter.value(i).write(&mut builder)?; + builder.append_value(""); } - } else { - builder.append_null(); - } - }); - - Arc::new(builder.finish()) -} - -fn extract_component_from_datetime_array< - A: ArrayAccessor, - OffsetSize, - T: ArrowTemporalType, - F, ->( - iter: ArrayIter, - mut builder: GenericStringBuilder, - tz: &str, - op: F, -) -> Result -where - OffsetSize: OffsetSizeTrait, - F: Fn(DateTime) -> String, - i64: From, -{ - let tz: Tz = tz.parse()?; - for value in iter { - match value { - Some(value) => match as_datetime_with_timezone::(value.into(), tz) { - Some(time) => builder.append_value(op(time)), - _ => { - return Err(ArrowError::ComputeError( - "Unable to read value as datetime".to_string(), - )); - } - }, - None => builder.append_null(), } } Ok(Arc::new(builder.finish())) } -/// Cast timestamp types to Utf8/LargeUtf8 -fn cast_timestamp_to_string( - array: &dyn Array, - tz: Option<&String>, -) -> Result -where - T: ArrowTemporalType + ArrowPrimitiveType, - i64: From<::Native>, - OffsetSize: OffsetSizeTrait, -{ - let array = array.as_any().downcast_ref::>().unwrap(); - - let builder = GenericStringBuilder::::new(); - - if let Some(tz) = tz { - // The macro calls `as_datetime` on timestamp values of the array. - // After applying timezone offset on the datatime, calling `to_string` to get - // the strings. - let iter = ArrayIter::new(array); - extract_component_from_datetime_array::<_, OffsetSize, T, _>( - iter, - builder, - tz, - |t| t.to_string(), - ) - } else { - // No timezone available. Calling `to_string` on the datatime value simply. - let iter = ArrayIter::new(array); - Ok(as_time_with_string_op::<_, OffsetSize, T, _>( - iter, - builder, - |t| t.to_string(), - )) - } -} - -/// Cast date32 types to Utf8/LargeUtf8 -fn cast_date32_to_string( - array: &dyn Array, -) -> Result { - let array = array.as_any().downcast_ref::().unwrap(); - - Ok(Arc::new( - (0..array.len()) - .map(|ix| { - if array.is_null(ix) { - None - } else { - array.value_as_date(ix).map(|v| v.to_string()) - } - }) - .collect::>(), - )) -} - -/// Cast date64 types to Utf8/LargeUtf8 -fn cast_date64_to_string( - array: &dyn Array, -) -> Result { - let array = array.as_any().downcast_ref::().unwrap(); - - Ok(Arc::new( - (0..array.len()) - .map(|ix| { - if array.is_null(ix) { - None - } else { - array.value_as_datetime(ix).map(|v| v.to_string()) - } - }) - .collect::>(), - )) -} - -/// Cast numeric types to Utf8 -fn cast_numeric_to_string( - array: &dyn Array, -) -> Result -where - FROM: ArrowPrimitiveType, - FROM::Native: lexical_core::ToLexical, - OffsetSize: OffsetSizeTrait, -{ - Ok(Arc::new(numeric_to_string_cast::( - array - .as_any() - .downcast_ref::>() - .unwrap(), - ))) -} - -fn numeric_to_string_cast( - from: &PrimitiveArray, -) -> GenericStringArray -where - T: ArrowPrimitiveType + ArrowPrimitiveType, - T::Native: lexical_core::ToLexical, - OffsetSize: OffsetSizeTrait, -{ - from.iter() - .map(|maybe_value| maybe_value.map(lexical_to_string)) - .collect() -} - /// Cast numeric types to Utf8 fn cast_string_to_numeric( from: &dyn Array, @@ -5521,8 +5310,8 @@ mod tests { let b = cast(&array, &DataType::Utf8).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(&DataType::Utf8, c.data_type()); - assert_eq!("1997-05-19 00:00:00", c.value(0)); - assert_eq!("2018-12-25 00:00:00", c.value(1)); + assert_eq!("1997-05-19T00:00:00", c.value(0)); + assert_eq!("2018-12-25T00:00:00", c.value(1)); } #[test] From 7b89bfcc868165f741dbf02cf6228d03914a3d52 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 8 Feb 2023 18:04:03 +0000 Subject: [PATCH 2/4] Fixes --- arrow-cast/src/cast.rs | 129 +++++++++-------------------------------- 1 file changed, 28 insertions(+), 101 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index b59ada5800e6..2a4679df118a 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -154,13 +154,12 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (_, Boolean) => DataType::is_numeric(from_type) || from_type == &Utf8 || from_type == &LargeUtf8, (Boolean, _) => DataType::is_numeric(to_type) || to_type == &Utf8 || to_type == &LargeUtf8, - (Utf8, LargeUtf8) => true, - (LargeUtf8, Utf8) => true, (Binary, LargeBinary | Utf8 | LargeUtf8) => true, (LargeBinary, Binary | Utf8 | LargeUtf8) => true, (Utf8, Binary | LargeBinary + | LargeUtf8 | Date32 | Date64 | Time32(TimeUnit::Second) @@ -169,10 +168,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time64(TimeUnit::Nanosecond) | Timestamp(TimeUnit::Nanosecond, None) ) => true, - (Utf8, _) => DataType::is_numeric(to_type) && to_type != &Float16, + (Utf8, _) => to_type.is_numeric() && to_type != &Float16, (LargeUtf8, Binary | LargeBinary + | Utf8 | Date32 | Date64 | Time32(TimeUnit::Second) @@ -181,11 +181,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { | Time64(TimeUnit::Nanosecond) | Timestamp(TimeUnit::Nanosecond, None) ) => true, - (LargeUtf8, _) => DataType::is_numeric(to_type) && to_type != &Float16, - (Timestamp(_, _), Utf8) | (Timestamp(_, _), LargeUtf8) => true, - (Date32, Utf8) | (Date32, LargeUtf8) => true, - (Date64, Utf8) | (Date64, LargeUtf8) => true, - (_, Utf8 | LargeUtf8) => DataType::is_numeric(from_type) && from_type != &Float16, + (LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, + (_, Utf8 | LargeUtf8) => from_type.is_primitive(), // start numeric casts ( @@ -1114,7 +1111,6 @@ pub fn cast_with_options( ))), }, (Utf8, _) => match to_type { - LargeUtf8 => cast_byte_container::(array), UInt8 => cast_string_to_numeric::(array, cast_options), UInt16 => cast_string_to_numeric::(array, cast_options), UInt32 => cast_string_to_numeric::(array, cast_options), @@ -1130,8 +1126,9 @@ pub fn cast_with_options( Binary => Ok(Arc::new(BinaryArray::from(as_string_array(array).clone()))), LargeBinary => { let binary = BinaryArray::from(as_string_array(array).clone()); - cast_byte_container::(&binary) + cast_byte_container::(&binary) } + LargeUtf8 => cast_byte_container::(array), Time32(TimeUnit::Second) => { cast_string_to_time32second::(array, cast_options) } @@ -1151,12 +1148,6 @@ pub fn cast_with_options( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, - (Binary, Utf8) => cast_binary_to_string::(array, cast_options), - (LargeBinary, LargeUtf8) => cast_binary_to_string::(array, cast_options), - (LargeBinary, Utf8) => cast_binary_to_generic_string::(array, cast_options), - (Binary, LargeUtf8) => cast_binary_to_generic_string::(array, cast_options), - (_, LargeUtf8) => value_to_string::(array), - (_, Utf8) => value_to_string::(array), (LargeUtf8, _) => match to_type { UInt8 => cast_string_to_numeric::(array, cast_options), UInt16 => cast_string_to_numeric::(array, cast_options), @@ -1170,10 +1161,11 @@ pub fn cast_with_options( Float64 => cast_string_to_numeric::(array, cast_options), Date32 => cast_string_to_date32::(array, cast_options), Date64 => cast_string_to_date64::(array, cast_options), + Utf8 => cast_byte_container::(array), Binary => { let large_binary = LargeBinaryArray::from(as_largestring_array(array).clone()); - cast_byte_container::(&large_binary) + cast_byte_container::(&large_binary) } LargeBinary => Ok(Arc::new(LargeBinaryArray::from( as_largestring_array(array).clone(), @@ -1198,19 +1190,31 @@ pub fn cast_with_options( ))), }, (Binary, _) => match to_type { + Utf8 => cast_binary_to_string::(array, cast_options), + LargeUtf8 => { + let array = cast_binary_to_string::(array, cast_options)?; + cast_byte_container::(array.as_ref()) + } LargeBinary => { - cast_byte_container::(array) + cast_byte_container::(array) } _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, (LargeBinary, _) => match to_type { - Binary => cast_byte_container::(array), + Utf8 => { + let array = cast_binary_to_string::(array, cast_options)?; + cast_byte_container::(array.as_ref()) + } + LargeUtf8 => cast_binary_to_string::(array, cast_options), + Binary => cast_byte_container::(array), _ => Err(ArrowError::CastError(format!( "Casting from {from_type:?} to {to_type:?} not supported", ))), }, + (from_type, LargeUtf8) if from_type.is_primitive() => value_to_string::(array), + (from_type, Utf8) if from_type.is_primitive() => value_to_string::(array), // start numeric casts (UInt8, UInt16) => { cast_numeric_arrays::(array, cast_options) @@ -3188,13 +3192,10 @@ fn cast_list_inner( /// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same /// offset size so re-encoding offset is unnecessary. -fn cast_binary_to_string( +fn cast_binary_to_string( array: &dyn Array, cast_options: &CastOptions, -) -> Result -where - O: OffsetSizeTrait + ToPrimitive, -{ +) -> Result { let array = array .as_any() .downcast_ref::>>() @@ -3246,86 +3247,12 @@ where } } -/// Helper function to cast from `GenericBinaryArray` to `GenericStringArray`. This function performs -/// UTF8 validation during casting. For invalid UTF8 value, it could be Null or returning `Err` depending -/// `CastOptions`. -fn cast_binary_to_generic_string( - array: &dyn Array, - cast_options: &CastOptions, -) -> Result -where - I: OffsetSizeTrait + ToPrimitive, - O: OffsetSizeTrait + NumCast, -{ - let array = array - .as_any() - .downcast_ref::>>() - .unwrap(); - - if !cast_options.safe { - let offsets = array.value_offsets(); - let values = array.value_data(); - - // We only need to validate that all values are valid UTF-8 - let validated = std::str::from_utf8(values) - .map_err(|_| ArrowError::CastError("Invalid UTF-8 sequence".to_string()))?; - - let mut offset_builder = BufferBuilder::::new(offsets.len()); - // Checks if the offset is a valid char boundary and re-encode the offset - offsets - .iter() - .try_for_each::<_, Result<_, ArrowError>>(|offset| { - if !validated.is_char_boundary(offset.as_usize()) { - return Err(ArrowError::CastError( - "Invalid UTF-8 sequence".to_string(), - )); - } - - let offset = ::from(*offset).ok_or_else(|| { - ArrowError::ComputeError(format!( - "{}Binary array too large to cast to {}String array", - I::PREFIX, - O::PREFIX - )) - })?; - offset_builder.append(offset); - Ok(()) - })?; - - let offset_buffer = offset_builder.finish(); - - let builder = ArrayData::builder(GenericStringArray::::DATA_TYPE) - .len(array.len()) - .add_buffer(offset_buffer) - .add_buffer(array.data().buffers()[1].clone()) - .null_count(array.null_count()) - .null_bit_buffer(array.data().null_buffer().cloned()); - - // SAFETY: - // Validated UTF-8 above - Ok(Arc::new(GenericStringArray::::from(unsafe { - builder.build_unchecked() - }))) - } else { - Ok(Arc::new( - array - .iter() - .map(|maybe_value| { - maybe_value.and_then(|value| std::str::from_utf8(value).ok()) - }) - .collect::>>(), - )) - } -} - /// Helper function to cast from one `ByteArrayType` to another and vice versa. /// If the target one (e.g., `LargeUtf8`) is too large for the source array it will return an Error. -fn cast_byte_container( - array: &dyn Array, -) -> Result +fn cast_byte_container(array: &dyn Array) -> Result where - FROM: ByteArrayType, - TO: ByteArrayType, + FROM: ByteArrayType, + TO: ByteArrayType, FROM::Offset: OffsetSizeTrait + ToPrimitive, TO::Offset: OffsetSizeTrait + NumCast, { From b2847485fab40b4389ff1bcef21d1bb684bf7602 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 8 Feb 2023 18:20:31 +0000 Subject: [PATCH 3/4] Further fixes --- arrow/tests/array_cast.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arrow/tests/array_cast.rs b/arrow/tests/array_cast.rs index ae73b1b4200b..7eeb00a8290a 100644 --- a/arrow/tests/array_cast.rs +++ b/arrow/tests/array_cast.rs @@ -56,8 +56,8 @@ fn test_cast_timestamp_to_string() { let b = cast(&array, &DataType::Utf8).unwrap(); let c = b.as_any().downcast_ref::().unwrap(); assert_eq!(&DataType::Utf8, c.data_type()); - assert_eq!("1997-05-19 00:00:00.005 +00:00", c.value(0)); - assert_eq!("2018-12-25 00:00:00.001 +00:00", c.value(1)); + assert_eq!("1997-05-19T00:00:00.005Z", c.value(0)); + assert_eq!("2018-12-25T00:00:00.001Z", c.value(1)); assert!(c.is_null(2)); } @@ -442,9 +442,9 @@ fn test_timestamp_cast_utf8() { let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); let expected = StringArray::from(vec![ - Some("1970-01-01 10:30:00"), + Some("1970-01-01T10:30:00"), None, - Some("1970-01-01 23:58:59"), + Some("1970-01-01T23:58:59"), ]); assert_eq!( @@ -458,9 +458,9 @@ fn test_timestamp_cast_utf8() { let out = cast(&(Arc::new(array) as ArrayRef), &DataType::Utf8).unwrap(); let expected = StringArray::from(vec![ - Some("1970-01-01 20:30:00 +10:00"), + Some("1970-01-01T20:30:00+10:00"), None, - Some("1970-01-02 09:58:59 +10:00"), + Some("1970-01-02T09:58:59+10:00"), ]); assert_eq!( From f707a70c0d118db2bdf6e7d16cd445b8663a6206 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 9 Feb 2023 12:18:30 +0000 Subject: [PATCH 4/4] Update arrow-cast/src/cast.rs Co-authored-by: Andrew Lamb --- arrow-cast/src/cast.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 2a4679df118a..3137e685b212 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -2123,6 +2123,7 @@ fn value_to_string( true => builder.append_null(), false => { formatter.value(i).write(&mut builder)?; + // tell the builder the row is finished builder.append_value(""); } }