Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support UTF8 cast to Timestamp with timezone #3673

Merged
merged 4 commits into from
Feb 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 42 additions & 4 deletions arrow-cast/src/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Time32(TimeUnit::Millisecond)
| Time64(TimeUnit::Microsecond)
| Time64(TimeUnit::Nanosecond)
| Timestamp(TimeUnit::Nanosecond, None)
| Timestamp(TimeUnit::Nanosecond, _)
) => true,
Comment on lines +171 to 172
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you use `*tz == Some("+00:00".to_owned())` as a condition, it seems necessary to add the same condition here as well. Otherwise there will be a casting error, because `can_cast_types` reports the cast as supported.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

(Utf8, _) => DataType::is_numeric(to_type) && to_type != &Float16,
(LargeUtf8,
Expand All @@ -180,7 +180,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Time32(TimeUnit::Millisecond)
| Time64(TimeUnit::Microsecond)
| Time64(TimeUnit::Nanosecond)
| Timestamp(TimeUnit::Nanosecond, None)
| Timestamp(TimeUnit::Nanosecond, _)
) => true,
(LargeUtf8, _) => DataType::is_numeric(to_type) && to_type != &Float16,
(Timestamp(_, _), Utf8) | (Timestamp(_, _), LargeUtf8) => true,
Expand Down Expand Up @@ -1145,7 +1145,7 @@ pub fn cast_with_options(
Time64(TimeUnit::Nanosecond) => {
cast_string_to_time64nanosecond::<i32>(array, cast_options)
}
Timestamp(TimeUnit::Nanosecond, None) => {
Timestamp(TimeUnit::Nanosecond, _) => {
cast_string_to_timestamp_ns::<i32>(array, cast_options)
}
_ => Err(ArrowError::CastError(format!(
Expand Down Expand Up @@ -1254,7 +1254,7 @@ pub fn cast_with_options(
Time64(TimeUnit::Nanosecond) => {
cast_string_to_time64nanosecond::<i64>(array, cast_options)
}
Timestamp(TimeUnit::Nanosecond, None) => {
Timestamp(TimeUnit::Nanosecond, _) => {
cast_string_to_timestamp_ns::<i64>(array, cast_options)
}
_ => Err(ArrowError::CastError(format!(
Expand Down Expand Up @@ -7833,4 +7833,42 @@ mod tests {
assert_eq!(v.value(0), 946728000000);
assert_eq!(v.value(1), 1608035696000);
}

#[test]
fn test_cast_utf8_to_timestamp() {
    // Each entry pairs an input string with the nanosecond value the cast
    // is expected to produce for it. The same expectations are asserted
    // for every target timezone exercised below.
    const CASES: [(&str, i64); 9] = [
        ("2023-01-01 04:05:06.789000-08:00", 1672574706789000000),
        ("2023-01-01 04:05:06.789000-07:00", 1672571106789000000),
        ("2023-01-01 04:05:06.789 -0800", 1672574706789000000),
        ("2023-01-01 04:05:06.789 -08:00", 1672574706789000000),
        ("2023-01-01 040506 +0730", 1672518906000000000),
        ("2023-01-01 040506 +07:30", 1672518906000000000),
        ("2023-01-01 04:05:06.789", 1672545906789000000),
        ("2023-01-01 04:05:06", 1672545906000000000),
        ("2023-01-01", 1672531200000000000),
    ];

    for tz in ["+00:00", "+02:00"] {
        // Build the UTF-8 input column from the table above.
        let inputs: Vec<&str> = CASES.iter().map(|(text, _)| *text).collect();
        let array = Arc::new(StringArray::from(inputs)) as ArrayRef;

        // Cast to a nanosecond timestamp carrying the timezone under test.
        let target = DataType::Timestamp(TimeUnit::Nanosecond, Some(tz.to_owned()));
        let casted = cast(&array, &target).unwrap();
        let timestamps = casted
            .as_any()
            .downcast_ref::<TimestampNanosecondArray>()
            .unwrap();

        // Check every row against its expected value, in input order.
        for (row, (_, expected)) in CASES.iter().enumerate() {
            assert_eq!(*expected, timestamps.value(row));
        }
    }
}
}
21 changes: 17 additions & 4 deletions arrow-cast/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,14 @@ use chrono::prelude::*;
/// the system timezone is set to Americas/New_York (UTC-5) the
/// timestamp will be interpreted as though it were
/// `1997-01-31T09:26:56.123-05:00`
///
/// Some formats that are supported by PostgreSQL <https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-DATETIME-TIME-TABLE>
/// are still not supported by chrono, such as:
/// "2023-01-01 040506 America/Los_Angeles",
/// "2023-01-01 04:05:06.789 +07:30:00",
/// "2023-01-01 040506 +07:30:00",
/// "2023-01-01 04:05:06.789 PST",
/// "2023-01-01 04:05:06.789 -08",
#[inline]
pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> {
// Fast path: RFC3339 timestamp (with a T)
Expand All @@ -81,10 +89,15 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> {
// separating the date and time with a space ' ' rather than 'T' to be
// (more) compatible with Apache Spark SQL

// timezone offset, using ' ' as a separator
// Example: 2020-09-08 13:42:29.190855-05:00
if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") {
return to_timestamp_nanos(ts.naive_utc());
let supported_formats = vec![
"%Y-%m-%d %H:%M:%S%.f%:z", // Example: 2020-09-08 13:42:29.190855-05:00
"%Y-%m-%d %H%M%S%.3f%:z", // Example: "2023-01-01 040506 +07:30"
];

for f in supported_formats.iter() {
if let Ok(ts) = DateTime::parse_from_str(s, f) {
return to_timestamp_nanos(ts.naive_utc());
}
}

// with an explicit Z, using ' ' as a separator
Expand Down
4 changes: 2 additions & 2 deletions arrow/tests/array_cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ fn test_can_cast_types() {

/// Create instances of arrays with varying types for cast tests
fn get_arrays_of_all_types() -> Vec<ArrayRef> {
let tz_name = String::from("America/New_York");
let tz_name = String::from("+08:00");
let binary_data: Vec<&[u8]> = vec![b"foo", b"bar"];
vec![
Arc::new(BinaryArray::from(binary_data.clone())),
Expand Down Expand Up @@ -349,7 +349,7 @@ fn create_decimal_array(
// Get a selection of datatypes to try and cast to
fn get_all_types() -> Vec<DataType> {
use DataType::*;
let tz_name = String::from("America/New_York");
let tz_name = String::from("+08:00");

let mut types = vec![
Null,
Expand Down