Skip to content

Commit

Permalink
Enable casting from Utf8View (#6077)
Browse files Browse the repository at this point in the history
* Enable casting from Utf8View -> string or temporal types

* save

* implement casting utf8view -> timestamp/interval types, with tests

* fix clippy

* fmt

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
a10y and alamb authored Jul 19, 2024
1 parent 9be0eb5 commit 8a5be13
Show file tree
Hide file tree
Showing 2 changed files with 265 additions and 62 deletions.
150 changes: 123 additions & 27 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
(LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView) => true,
(FixedSizeBinary(_), Binary | LargeBinary) => true,
(
Utf8 | LargeUtf8,
Utf8 | LargeUtf8 | Utf8View,
Binary
| LargeBinary
| Utf8
Expand All @@ -228,7 +228,6 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Interval(_),
) => true,
(Utf8 | LargeUtf8, Utf8View) => true,
(Utf8View, Utf8 | LargeUtf8) => true,
(BinaryView, Binary | LargeBinary) => true,
(Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
(_, Utf8 | LargeUtf8) => from_type.is_primitive(),
Expand Down Expand Up @@ -1269,6 +1268,56 @@ pub fn cast_with_options(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
},
(Utf8View, _) => match to_type {
UInt8 => parse_string_view::<UInt8Type>(array, cast_options),
UInt16 => parse_string_view::<UInt16Type>(array, cast_options),
UInt32 => parse_string_view::<UInt32Type>(array, cast_options),
UInt64 => parse_string_view::<UInt64Type>(array, cast_options),
Int8 => parse_string_view::<Int8Type>(array, cast_options),
Int16 => parse_string_view::<Int16Type>(array, cast_options),
Int32 => parse_string_view::<Int32Type>(array, cast_options),
Int64 => parse_string_view::<Int64Type>(array, cast_options),
Float32 => parse_string_view::<Float32Type>(array, cast_options),
Float64 => parse_string_view::<Float64Type>(array, cast_options),
Date32 => parse_string_view::<Date32Type>(array, cast_options),
Date64 => parse_string_view::<Date64Type>(array, cast_options),
Binary => cast_view_to_byte::<StringViewType, GenericBinaryType<i32>>(array),
LargeBinary => cast_view_to_byte::<StringViewType, GenericBinaryType<i64>>(array),
Utf8 => cast_view_to_byte::<StringViewType, GenericStringType<i32>>(array),
LargeUtf8 => cast_view_to_byte::<StringViewType, GenericStringType<i64>>(array),
Time32(TimeUnit::Second) => parse_string_view::<Time32SecondType>(array, cast_options),
Time32(TimeUnit::Millisecond) => {
parse_string_view::<Time32MillisecondType>(array, cast_options)
}
Time64(TimeUnit::Microsecond) => {
parse_string_view::<Time64MicrosecondType>(array, cast_options)
}
Time64(TimeUnit::Nanosecond) => {
parse_string_view::<Time64NanosecondType>(array, cast_options)
}
Timestamp(TimeUnit::Second, to_tz) => {
cast_view_to_timestamp::<TimestampSecondType>(array, to_tz, cast_options)
}
Timestamp(TimeUnit::Millisecond, to_tz) => {
cast_view_to_timestamp::<TimestampMillisecondType>(array, to_tz, cast_options)
}
Timestamp(TimeUnit::Microsecond, to_tz) => {
cast_view_to_timestamp::<TimestampMicrosecondType>(array, to_tz, cast_options)
}
Timestamp(TimeUnit::Nanosecond, to_tz) => {
cast_view_to_timestamp::<TimestampNanosecondType>(array, to_tz, cast_options)
}
Interval(IntervalUnit::YearMonth) => {
cast_view_to_year_month_interval(array, cast_options)
}
Interval(IntervalUnit::DayTime) => cast_view_to_day_time_interval(array, cast_options),
Interval(IntervalUnit::MonthDayNano) => {
cast_view_to_month_day_nano_interval(array, cast_options)
}
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
},
(LargeUtf8, _) => match to_type {
UInt8 => parse_string::<UInt8Type, i64>(array, cast_options),
UInt16 => parse_string::<UInt16Type, i64>(array, cast_options),
Expand Down Expand Up @@ -1365,8 +1414,6 @@ pub fn cast_with_options(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
},
(Utf8View, Utf8) => cast_view_to_byte::<StringViewType, GenericStringType<i32>>(array),
(Utf8View, LargeUtf8) => cast_view_to_byte::<StringViewType, GenericStringType<i64>>(array),
(BinaryView, Binary) => cast_view_to_byte::<BinaryViewType, GenericBinaryType<i32>>(array),
(BinaryView, LargeBinary) => {
cast_view_to_byte::<BinaryViewType, GenericBinaryType<i64>>(array)
Expand Down Expand Up @@ -3960,6 +4007,11 @@ mod tests {

#[test]
fn test_cast_string_to_timestamp() {
let a0 = Arc::new(StringViewArray::from(vec![
Some("2020-09-08T12:00:00.123456789+00:00"),
Some("Not a valid date"),
None,
])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("2020-09-08T12:00:00.123456789+00:00"),
Some("Not a valid date"),
Expand All @@ -3970,7 +4022,7 @@ mod tests {
Some("Not a valid date"),
None,
])) as ArrayRef;
for array in &[a1, a2] {
for array in &[a0, a1, a2] {
for time_unit in &[
TimeUnit::Second,
TimeUnit::Millisecond,
Expand Down Expand Up @@ -4039,6 +4091,11 @@ mod tests {

#[test]
fn test_cast_string_to_date32() {
let a0 = Arc::new(StringViewArray::from(vec![
Some("2018-12-25"),
Some("Not a valid date"),
None,
])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("2018-12-25"),
Some("Not a valid date"),
Expand All @@ -4049,7 +4106,7 @@ mod tests {
Some("Not a valid date"),
None,
])) as ArrayRef;
for array in &[a1, a2] {
for array in &[a0, a1, a2] {
let to_type = DataType::Date32;
let b = cast(array, &to_type).unwrap();
let c = b.as_primitive::<Date32Type>();
Expand All @@ -4071,30 +4128,47 @@ mod tests {

#[test]
fn test_cast_string_format_yyyymmdd_to_date32() {
let a = Arc::new(StringArray::from(vec![
let a0 = Arc::new(StringViewArray::from(vec![
Some("2020-12-25"),
Some("20201117"),
])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("2020-12-25"),
Some("20201117"),
])) as ArrayRef;
let a2 = Arc::new(LargeStringArray::from(vec![
Some("2020-12-25"),
Some("20201117"),
])) as ArrayRef;

let to_type = DataType::Date32;
let options = CastOptions {
safe: false,
format_options: FormatOptions::default(),
};
let result = cast_with_options(&a, &to_type, &options).unwrap();
let c = result.as_primitive::<Date32Type>();
assert_eq!(
chrono::NaiveDate::from_ymd_opt(2020, 12, 25),
c.value_as_date(0)
);
assert_eq!(
chrono::NaiveDate::from_ymd_opt(2020, 11, 17),
c.value_as_date(1)
);
for array in &[a0, a1, a2] {
let to_type = DataType::Date32;
let options = CastOptions {
safe: false,
format_options: FormatOptions::default(),
};
let result = cast_with_options(&array, &to_type, &options).unwrap();
let c = result.as_primitive::<Date32Type>();
assert_eq!(
chrono::NaiveDate::from_ymd_opt(2020, 12, 25),
c.value_as_date(0)
);
assert_eq!(
chrono::NaiveDate::from_ymd_opt(2020, 11, 17),
c.value_as_date(1)
);
}
}

#[test]
fn test_cast_string_to_time32second() {
let a0 = Arc::new(StringViewArray::from(vec![
Some("08:08:35.091323414"),
Some("08:08:60.091323414"), // leap second
Some("08:08:61.091323414"), // not valid
Some("Not a valid time"),
None,
])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("08:08:35.091323414"),
Some("08:08:60.091323414"), // leap second
Expand All @@ -4109,7 +4183,7 @@ mod tests {
Some("Not a valid time"),
None,
])) as ArrayRef;
for array in &[a1, a2] {
for array in &[a0, a1, a2] {
let to_type = DataType::Time32(TimeUnit::Second);
let b = cast(array, &to_type).unwrap();
let c = b.as_primitive::<Time32SecondType>();
Expand All @@ -4130,6 +4204,13 @@ mod tests {

#[test]
fn test_cast_string_to_time32millisecond() {
let a0 = Arc::new(StringViewArray::from(vec![
Some("08:08:35.091323414"),
Some("08:08:60.091323414"), // leap second
Some("08:08:61.091323414"), // not valid
Some("Not a valid time"),
None,
])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("08:08:35.091323414"),
Some("08:08:60.091323414"), // leap second
Expand All @@ -4144,7 +4225,7 @@ mod tests {
Some("Not a valid time"),
None,
])) as ArrayRef;
for array in &[a1, a2] {
for array in &[a0, a1, a2] {
let to_type = DataType::Time32(TimeUnit::Millisecond);
let b = cast(array, &to_type).unwrap();
let c = b.as_primitive::<Time32MillisecondType>();
Expand All @@ -4165,6 +4246,11 @@ mod tests {

#[test]
fn test_cast_string_to_time64microsecond() {
let a0 = Arc::new(StringViewArray::from(vec![
Some("08:08:35.091323414"),
Some("Not a valid time"),
None,
])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("08:08:35.091323414"),
Some("Not a valid time"),
Expand All @@ -4175,7 +4261,7 @@ mod tests {
Some("Not a valid time"),
None,
])) as ArrayRef;
for array in &[a1, a2] {
for array in &[a0, a1, a2] {
let to_type = DataType::Time64(TimeUnit::Microsecond);
let b = cast(array, &to_type).unwrap();
let c = b.as_primitive::<Time64MicrosecondType>();
Expand All @@ -4194,6 +4280,11 @@ mod tests {

#[test]
fn test_cast_string_to_time64nanosecond() {
let a0 = Arc::new(StringViewArray::from(vec![
Some("08:08:35.091323414"),
Some("Not a valid time"),
None,
])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("08:08:35.091323414"),
Some("Not a valid time"),
Expand All @@ -4204,7 +4295,7 @@ mod tests {
Some("Not a valid time"),
None,
])) as ArrayRef;
for array in &[a1, a2] {
for array in &[a0, a1, a2] {
let to_type = DataType::Time64(TimeUnit::Nanosecond);
let b = cast(array, &to_type).unwrap();
let c = b.as_primitive::<Time64NanosecondType>();
Expand All @@ -4223,6 +4314,11 @@ mod tests {

#[test]
fn test_cast_string_to_date64() {
let a0 = Arc::new(StringViewArray::from(vec![
Some("2020-09-08T12:00:00"),
Some("Not a valid date"),
None,
])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("2020-09-08T12:00:00"),
Some("Not a valid date"),
Expand All @@ -4233,7 +4329,7 @@ mod tests {
Some("Not a valid date"),
None,
])) as ArrayRef;
for array in &[a1, a2] {
for array in &[a0, a1, a2] {
let to_type = DataType::Date64;
let b = cast(array, &to_type).unwrap();
let c = b.as_primitive::<Date64Type>();
Expand Down
Loading

0 comments on commit 8a5be13

Please sign in to comment.