From 26aa19943a457d0c156ec1e160c43c728900a0e4 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 31 Oct 2023 10:27:28 -0700 Subject: [PATCH 1/4] Support casting from integer to binary --- arrow-cast/src/cast.rs | 80 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 97307f076f34..edcd98257054 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -47,7 +47,7 @@ use crate::parse::{ string_to_datetime, Parser, }; use arrow_array::{builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *}; -use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer}; +use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer, ToByteSlice}; use arrow_data::ArrayData; use arrow_schema::*; use arrow_select::take::take; @@ -203,6 +203,8 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16, (_, Utf8 | LargeUtf8) => from_type.is_primitive(), + (_, Binary | LargeBinary) => from_type.is_integer(), + // start numeric casts ( UInt8 | UInt16 | UInt32 | UInt64 | Int8 | Int16 | Int32 | Int64 | Float32 | Float64, @@ -1368,6 +1370,28 @@ pub fn cast_with_options( (from_type, Utf8) if from_type.is_primitive() => { value_to_string::(array, cast_options) } + (from_type, Binary) if from_type.is_integer() => match from_type { + UInt8 => cast_numeric_to_binary::(array), + UInt16 => cast_numeric_to_binary::(array), + UInt32 => cast_numeric_to_binary::(array), + UInt64 => cast_numeric_to_binary::(array), + Int8 => cast_numeric_to_binary::(array), + Int16 => cast_numeric_to_binary::(array), + Int32 => cast_numeric_to_binary::(array), + Int64 => cast_numeric_to_binary::(array), + _ => unreachable!(), + }, + (from_type, LargeBinary) if from_type.is_integer() => match from_type { + UInt8 => cast_numeric_to_binary::(array), + UInt16 => cast_numeric_to_binary::(array), + UInt32 => cast_numeric_to_binary::(array), + UInt64 => cast_numeric_to_binary::(array), + Int8 => cast_numeric_to_binary::(array), + Int16 => cast_numeric_to_binary::(array), + Int32 => cast_numeric_to_binary::(array), + Int64 => cast_numeric_to_binary::(array), + _ => unreachable!(), + }, // start numeric casts (UInt8, UInt16) => cast_numeric_arrays::(array, cast_options), (UInt8, UInt32) => cast_numeric_arrays::(array, cast_options), @@ -2317,6 +2341,22 @@ fn value_to_string( Ok(Arc::new(builder.finish())) } +fn cast_numeric_to_binary( + array: &dyn Array, +) -> Result { + let array = array.as_primitive::(); + + let mut builder = GenericBinaryBuilder::::new(); + for i in 0..array.len() { + if array.is_null(i) { + builder.append_null(); + } else { + builder.append_value(array.value(i).to_byte_slice()); + } + } + Ok(Arc::new(builder.finish())) +} + /// Parse UTF-8 fn parse_string( array: &dyn Array, @@ -5176,6 +5216,44 @@ mod tests { assert!(down_cast.is_null(2)); } + #[test] + fn test_numeric_to_binary() { + let a = Int16Array::from(vec![Some(1), Some(511), None]); + + let array_ref = cast(&a, &DataType::Binary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(&1_i16.to_le_bytes(), down_cast.value(0)); + assert_eq!(&511_i16.to_le_bytes(), down_cast.value(1)); + assert!(down_cast.is_null(2)); + + let a = Int64Array::from(vec![Some(-1), Some(123456789), None]); + + let array_ref = cast(&a, &DataType::Binary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(&(-1 as i64).to_le_bytes(), down_cast.value(0)); + assert_eq!(&123456789_i64.to_le_bytes(), down_cast.value(1)); + assert!(down_cast.is_null(2)); + } + + #[test] + fn test_numeric_to_large_binary() { + let a = Int16Array::from(vec![Some(1), Some(511), None]); + + let array_ref = cast(&a, &DataType::LargeBinary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(&1_i16.to_le_bytes(), down_cast.value(0)); + assert_eq!(&511_i16.to_le_bytes(), down_cast.value(1)); + assert!(down_cast.is_null(2)); + + let a = Int64Array::from(vec![Some(-1), Some(123456789), None]); + + let array_ref = cast(&a, &DataType::LargeBinary).unwrap(); + let down_cast = array_ref.as_binary::(); + assert_eq!(&(-1 as i64).to_le_bytes(), down_cast.value(0)); + assert_eq!(&123456789_i64.to_le_bytes(), down_cast.value(1)); + assert!(down_cast.is_null(2)); + } + #[test] fn test_cast_date32_to_int32() { let array = Date32Array::from(vec![10000, 17890]); From 61a34175fb676aac5f5564bc3e1c625ad26ec8ad Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 1 Nov 2023 00:19:08 -0700 Subject: [PATCH 2/4] Fix clippy --- arrow-cast/src/cast.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index edcd98257054..80ded729e5d9 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -5230,7 +5230,7 @@ mod tests { let array_ref = cast(&a, &DataType::Binary).unwrap(); let down_cast = array_ref.as_binary::(); - assert_eq!(&(-1 as i64).to_le_bytes(), down_cast.value(0)); + assert_eq!(&(-1_i64).to_le_bytes(), down_cast.value(0)); assert_eq!(&123456789_i64.to_le_bytes(), down_cast.value(1)); assert!(down_cast.is_null(2)); } @@ -5249,7 +5249,7 @@ mod tests { let array_ref = cast(&a, &DataType::LargeBinary).unwrap(); let down_cast = array_ref.as_binary::(); - assert_eq!(&(-1 as i64).to_le_bytes(), down_cast.value(0)); + assert_eq!(&(-1_i64).to_le_bytes(), down_cast.value(0)); assert_eq!(&123456789_i64.to_le_bytes(), down_cast.value(1)); assert!(down_cast.is_null(2)); } From 361894699ebdee47505cda8f7e7f1cc42e24de88 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Wed, 1 Nov 2023 09:48:54 -0700 Subject: [PATCH 3/4] For review --- arrow-cast/src/cast.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index 80ded729e5d9..d223420ce34a 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -2346,7 +2346,8 @@ fn cast_numeric_to_binary( ) -> Result { let array = array.as_primitive::(); - let mut builder = GenericBinaryBuilder::::new(); + let mut builder = + GenericBinaryBuilder::::with_capacity(array.len(), array.values().inner().capacity()); for i in 0..array.len() { if array.is_null(i) { builder.append_null(); From 0369d5f454cab983749c3e3b48a02e461c9ee6f8 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 2 Nov 2023 13:24:44 -0700 Subject: [PATCH 4/4] Reuse array buffers --- arrow-cast/src/cast.rs | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs index d223420ce34a..684e02b87e6c 100644 --- a/arrow-cast/src/cast.rs +++ b/arrow-cast/src/cast.rs @@ -47,7 +47,7 @@ use crate::parse::{ string_to_datetime, Parser, }; use arrow_array::{builder::*, cast::*, temporal_conversions::*, timezone::Tz, types::*, *}; -use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer, ToByteSlice}; +use arrow_buffer::{i256, ArrowNativeType, Buffer, OffsetBuffer}; use arrow_data::ArrayData; use arrow_schema::*; use arrow_select::take::take; @@ -2345,17 +2345,13 @@ fn cast_numeric_to_binary( array: &dyn Array, ) -> Result { let array = array.as_primitive::(); - - let mut builder = - GenericBinaryBuilder::::with_capacity(array.len(), array.values().inner().capacity()); - for i in 0..array.len() { - if array.is_null(i) { - builder.append_null(); - } else { - builder.append_value(array.value(i).to_byte_slice()); - } - } - Ok(Arc::new(builder.finish())) + let size = std::mem::size_of::(); + let offsets = OffsetBuffer::from_lengths(std::iter::repeat(size).take(array.len())); + Ok(Arc::new(GenericBinaryArray::::new( + offsets, + array.values().inner().clone(), + array.nulls().cloned(), + ))) } /// Parse UTF-8