From 8ebc7237944c56342e1c0599c288a9d0149d701a Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Wed, 11 Jan 2023 16:52:35 -0800 Subject: [PATCH 01/22] Use array_value_to_string in arrow-csv --- arrow-csv/src/writer.rs | 228 ++++------------------------------------ 1 file changed, 22 insertions(+), 206 deletions(-) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index bc11eef2fcf1..7b7534058479 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -63,20 +63,14 @@ //! } //! ``` -use arrow_array::timezone::Tz; use arrow_array::types::*; use arrow_array::*; -use arrow_cast::display::{lexical_to_string, make_string_from_decimal}; +use arrow_cast::display::{array_value_to_string, lexical_to_string}; use arrow_schema::*; -use chrono::{DateTime, Utc}; use std::io::Write; use crate::map_csv_error; -const DEFAULT_DATE_FORMAT: &str = "%F"; -const DEFAULT_TIME_FORMAT: &str = "%T"; -const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f"; -const DEFAULT_TIMESTAMP_TZ_FORMAT: &str = "%FT%H:%M:%S.%9f%:z"; const DEFAULT_NULL_VALUE: &str = ""; fn write_primitive_value(array: &ArrayRef, i: usize) -> String @@ -114,18 +108,6 @@ pub struct Writer { writer: csv::Writer, /// Whether file should be written with headers. Defaults to `true` has_headers: bool, - /// The date format for date arrays - date_format: String, - /// The datetime format for datetime arrays - datetime_format: String, - /// The timestamp format for timestamp arrays - #[allow(dead_code)] - timestamp_format: String, - /// The timestamp format for timestamp (with timezone) arrays - #[allow(dead_code)] - timestamp_tz_format: String, - /// The time format for time arrays - time_format: String, /// Is the beginning-of-writer beginning: bool, /// The value to represent null entries @@ -141,11 +123,6 @@ impl Writer { Writer { writer, has_headers: true, - date_format: DEFAULT_DATE_FORMAT.to_string(), - datetime_format: DEFAULT_TIMESTAMP_FORMAT.to_string(), - time_format: DEFAULT_TIME_FORMAT.to_string(), - timestamp_format: DEFAULT_TIMESTAMP_FORMAT.to_string(), - timestamp_tz_format: DEFAULT_TIMESTAMP_TZ_FORMAT.to_string(), beginning: true, null_value: DEFAULT_NULL_VALUE.to_string(), } @@ -177,88 +154,29 @@ impl Writer { DataType::UInt16 => write_primitive_value::(col, row_index), DataType::UInt32 => write_primitive_value::(col, row_index), DataType::UInt64 => write_primitive_value::(col, row_index), - DataType::Boolean => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value(row_index).to_string() - } - DataType::Utf8 => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value(row_index).to_owned() - } - DataType::LargeUtf8 => { - let c = col.as_any().downcast_ref::().unwrap(); - c.value(row_index).to_owned() - } - DataType::Date32 => { - write_temporal_value!( - col, - Date32Array, - &self.date_format, - col_index, - row_index, - value_as_date, - "Date32" - ) - } - DataType::Date64 => { - write_temporal_value!( - col, - Date64Array, - &self.datetime_format, - col_index, - row_index, - value_as_datetime, - "Date64" - ) - } + DataType::Boolean => array_value_to_string(col, row_index)?.to_string(), + DataType::Utf8 => array_value_to_string(col, row_index)?.to_string(), + DataType::LargeUtf8 => array_value_to_string(col, row_index)?.to_string(), + DataType::Date32 => array_value_to_string(col, row_index)?.to_string(), + DataType::Date64 => array_value_to_string(col, row_index)?.to_string(), DataType::Time32(TimeUnit::Second) => { - write_temporal_value!( - col, - Time32SecondArray, - &self.time_format, - col_index, - row_index, - value_as_time, - "Time32" - ) + array_value_to_string(col, row_index)?.to_string() } DataType::Time32(TimeUnit::Millisecond) => { - write_temporal_value!( - col, - Time32MillisecondArray, - &self.time_format, - col_index, - row_index, - value_as_time, - "Time32" - ) + array_value_to_string(col, row_index)?.to_string() } DataType::Time64(TimeUnit::Microsecond) => { - write_temporal_value!( - col, - Time64MicrosecondArray, - &self.time_format, - col_index, - row_index, - value_as_time, - "Time64" - ) + array_value_to_string(col, row_index)?.to_string() } DataType::Time64(TimeUnit::Nanosecond) => { - write_temporal_value!( - col, - Time64NanosecondArray, - &self.time_format, - col_index, - row_index, - value_as_time, - "Time64" - ) + array_value_to_string(col, row_index)?.to_string() + } + DataType::Timestamp(_, _) => { + array_value_to_string(col, row_index)?.to_string() } - DataType::Timestamp(time_unit, time_zone) => { - self.handle_timestamp(time_unit, time_zone.as_ref(), row_index, col)? + DataType::Decimal128(..) => { + array_value_to_string(col, row_index)?.to_string() } - DataType::Decimal128(..) => make_string_from_decimal(col, row_index)?, t => { // List and Struct arrays not supported by the writer, any // other type needs to be implemented @@ -272,52 +190,6 @@ impl Writer { Ok(()) } - fn handle_timestamp( - &self, - time_unit: &TimeUnit, - time_zone: Option<&String>, - row_index: usize, - col: &ArrayRef, - ) -> Result { - use TimeUnit::*; - let datetime = match time_unit { - Second => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - Millisecond => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - Microsecond => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - Nanosecond => col - .as_any() - .downcast_ref::() - .unwrap() - .value_as_datetime(row_index) - .unwrap(), - }; - - let tz: Option = time_zone.map(|x| x.parse()).transpose()?; - match tz { - Some(tz) => { - let utc_time = DateTime::::from_utc(datetime, Utc); - let local_time = utc_time.with_timezone(&tz); - Ok(local_time.format(&self.timestamp_tz_format).to_string()) - } - None => Ok(datetime.format(&self.timestamp_format).to_string()), - } - } - /// Write a vector of record batches to a writable object pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { let num_columns = batch.num_columns(); @@ -367,16 +239,6 @@ pub struct WriterBuilder { delimiter: Option, /// Whether to write column names as file headers. Defaults to `true` has_headers: bool, - /// Optional date format for date arrays - date_format: Option, - /// Optional datetime format for datetime arrays - datetime_format: Option, - /// Optional timestamp format for timestamp arrays - timestamp_format: Option, - /// Optional timestamp format for timestamp with timezone arrays - timestamp_tz_format: Option, - /// Optional time format for time arrays - time_format: Option, /// Optional value to represent null null_value: Option, } @@ -386,11 +248,6 @@ impl Default for WriterBuilder { Self { has_headers: true, delimiter: None, - date_format: Some(DEFAULT_DATE_FORMAT.to_string()), - datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), - time_format: Some(DEFAULT_TIME_FORMAT.to_string()), - timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), - timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), null_value: Some(DEFAULT_NULL_VALUE.to_string()), } } @@ -433,30 +290,6 @@ impl WriterBuilder { self } - /// Set the CSV file's date format - pub fn with_date_format(mut self, format: String) -> Self { - self.date_format = Some(format); - self - } - - /// Set the CSV file's datetime format - pub fn with_datetime_format(mut self, format: String) -> Self { - self.datetime_format = Some(format); - self - } - - /// Set the CSV file's time format - pub fn with_time_format(mut self, format: String) -> Self { - self.time_format = Some(format); - self - } - - /// Set the CSV file's timestamp format - pub fn with_timestamp_format(mut self, format: String) -> Self { - self.timestamp_format = Some(format); - self - } - /// Set the value to represent null in output pub fn with_null(mut self, null_value: String) -> Self { self.null_value = Some(null_value); @@ -471,21 +304,6 @@ impl WriterBuilder { Writer { writer, has_headers: self.has_headers, - date_format: self - .date_format - .unwrap_or_else(|| DEFAULT_DATE_FORMAT.to_string()), - datetime_format: self - .datetime_format - .unwrap_or_else(|| DEFAULT_TIMESTAMP_FORMAT.to_string()), - time_format: self - .time_format - .unwrap_or_else(|| DEFAULT_TIME_FORMAT.to_string()), - timestamp_format: self - .timestamp_format - .unwrap_or_else(|| DEFAULT_TIMESTAMP_FORMAT.to_string()), - timestamp_tz_format: self - .timestamp_tz_format - .unwrap_or_else(|| DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), beginning: true, null_value: self .null_value @@ -498,8 +316,7 @@ impl WriterBuilder { mod tests { use super::*; - use crate::Reader; - use std::io::{Cursor, Read, Seek}; + use std::io::{Read, Seek}; use std::sync::Arc; #[test] @@ -569,11 +386,11 @@ mod tests { let expected = r#"c1,c2,c3,c4,c5,c6,c7 Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo +consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes +sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo +consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes +sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo "#; assert_eq!(expected.to_string(), String::from_utf8(buffer).unwrap()); } @@ -619,8 +436,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo let builder = WriterBuilder::new() .has_headers(false) .with_delimiter(b'|') - .with_null("NULL".to_string()) - .with_time_format("%r".to_string()); + .with_null("NULL".to_string()); let mut writer = builder.build(&mut file); let batches = vec![&batch]; for batch in batches { @@ -634,7 +450,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo file.read_to_end(&mut buffer).unwrap(); assert_eq!( - "Lorem ipsum dolor sit amet|123.564532|3|true|12:20:34 AM\nconsectetur adipiscing elit|NULL|2|false|06:51:20 AM\nsed do eiusmod tempor|-556132.25|1|NULL|11:46:03 PM\n" + "Lorem ipsum dolor sit amet|123.564532|3|true|00:20:34\nconsectetur adipiscing elit|NULL|2|false|06:51:20\nsed do eiusmod tempor|-556132.25|1|NULL|23:46:03\n" .to_string(), String::from_utf8(buffer).unwrap() ); From 5d891953383436ff10dfad6a7dcd4609bce254db Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Wed, 11 Jan 2023 17:18:09 -0800 Subject: [PATCH 02/22] Fix test --- arrow/tests/csv.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs index 83a279ce4794..10b4cefbb64b 100644 --- a/arrow/tests/csv.rs +++ b/arrow/tests/csv.rs @@ -57,8 +57,8 @@ fn test_export_csv_timestamps() { drop(writer); let left = "c1,c2 -2019-04-18T20:54:47.378000000+10:00,2019-04-18T10:54:47.378000000 -2021-10-30T17:59:07.000000000+11:00,2021-10-30T06:59:07.000000000\n"; +2019-04-18T20:54:47.378+10:00,2019-04-18T10:54:47.378 +2021-10-30T17:59:07+11:00,2021-10-30T06:59:07\n"; let right = String::from_utf8(sw).unwrap(); assert_eq!(left, right); } From 20cf37a5bdbf792af111cace7534aebdd6760fd8 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Mon, 16 Jan 2023 13:43:58 -0800 Subject: [PATCH 03/22] Add datetime_array_value_to_string to allow passing datetime format --- arrow-cast/src/display.rs | 45 +++++++++++++-- arrow-csv/src/writer.rs | 112 +++++++++++++++++++++++++++++++------- 2 files changed, 132 insertions(+), 25 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 16fbfb0bbce5..ec2a06684b43 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -184,6 +184,17 @@ macro_rules! make_string_datetime_with_tz { }}; } +macro_rules! make_string_datetime_with_format { + ($array_type:ty, $format: ident, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + Ok(array + .value_as_datetime($row) + .map(|d| d.format($format).to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) + }}; +} + // It's not possible to do array.value($row).to_string() for &[u8], let's format it as hex macro_rules! make_string_hex { ($array_type:ty, $column: ident, $row: ident) => {{ @@ -323,9 +334,10 @@ fn append_map_field_string( /// /// Note this function is quite inefficient and is unlikely to be /// suitable for converting large arrays or record batches. -pub fn array_value_to_string( +fn array_value_to_string_internal( column: &ArrayRef, row: usize, + datetime_format: Option<&String> ) -> Result { if column.is_null(row) { return Ok("".to_string()); @@ -352,14 +364,22 @@ pub fn array_value_to_string( DataType::Float64 => make_string!(array::Float64Array, column, row), DataType::Decimal128(..) => make_string_from_decimal(column, row), DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => { - match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( + match datetime_format { + Some(format) => make_string_datetime_with_format!( array::TimestampSecondArray, - tz_string, + format, column, row ), - None => make_string_datetime!(array::TimestampSecondArray, column, row), + None => match tz_string_opt { + Some(tz_string) => make_string_datetime_with_tz!( + array::TimestampSecondArray, + tz_string, + column, + row + ), + None => make_string_datetime!(array::TimestampSecondArray, column, row), + }, } } DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Millisecond => { @@ -524,6 +544,21 @@ pub fn array_value_to_string( } } +pub fn array_value_to_string( + column: &ArrayRef, + row: usize, +) -> Result { + array_value_to_string_internal(column, row, None) +} + +pub fn datetime_array_value_to_string( + column: &ArrayRef, + row: usize, + format: &String, +) -> Result { + array_value_to_string_internal(column, row, Some(format)) +} + /// Converts the value of the union array at `row` to a String fn union_to_string( column: &ArrayRef, diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 7b7534058479..9e5bc7c98d59 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -65,12 +65,16 @@ use arrow_array::types::*; use arrow_array::*; -use arrow_cast::display::{array_value_to_string, lexical_to_string}; +use arrow_cast::display::{lexical_to_string, array_value_to_string, datetime_array_value_to_string}; use arrow_schema::*; use std::io::Write; use crate::map_csv_error; +const DEFAULT_DATE_FORMAT: &str = "%F"; +const DEFAULT_TIME_FORMAT: &str = "%T"; +const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f"; +const DEFAULT_TIMESTAMP_TZ_FORMAT: &str = "%FT%H:%M:%S.%9f%:z"; const DEFAULT_NULL_VALUE: &str = ""; fn write_primitive_value(array: &ArrayRef, i: usize) -> String @@ -108,6 +112,18 @@ pub struct Writer { writer: csv::Writer, /// Whether file should be written with headers. Defaults to `true` has_headers: bool, + /// The date format for date arrays + date_format: String, + /// The datetime format for datetime arrays + datetime_format: String, + /// The timestamp format for timestamp arrays + #[allow(dead_code)] + timestamp_format: String, + /// The timestamp format for timestamp (with timezone) arrays + #[allow(dead_code)] + timestamp_tz_format: String, + /// The time format for time arrays + time_format: String, /// Is the beginning-of-writer beginning: bool, /// The value to represent null entries @@ -123,6 +139,11 @@ impl Writer { Writer { writer, has_headers: true, + date_format: DEFAULT_DATE_FORMAT.to_string(), + datetime_format: DEFAULT_TIMESTAMP_FORMAT.to_string(), + time_format: DEFAULT_TIME_FORMAT.to_string(), + timestamp_format: DEFAULT_TIMESTAMP_FORMAT.to_string(), + timestamp_tz_format: DEFAULT_TIMESTAMP_TZ_FORMAT.to_string(), beginning: true, null_value: DEFAULT_NULL_VALUE.to_string(), } @@ -157,26 +178,23 @@ impl Writer { DataType::Boolean => array_value_to_string(col, row_index)?.to_string(), DataType::Utf8 => array_value_to_string(col, row_index)?.to_string(), DataType::LargeUtf8 => array_value_to_string(col, row_index)?.to_string(), - DataType::Date32 => array_value_to_string(col, row_index)?.to_string(), - DataType::Date64 => array_value_to_string(col, row_index)?.to_string(), - DataType::Time32(TimeUnit::Second) => { - array_value_to_string(col, row_index)?.to_string() - } - DataType::Time32(TimeUnit::Millisecond) => { - array_value_to_string(col, row_index)?.to_string() - } - DataType::Time64(TimeUnit::Microsecond) => { - array_value_to_string(col, row_index)?.to_string() - } - DataType::Time64(TimeUnit::Nanosecond) => { - array_value_to_string(col, row_index)?.to_string() - } - DataType::Timestamp(_, _) => { - array_value_to_string(col, row_index)?.to_string() - } - DataType::Decimal128(..) => { - array_value_to_string(col, row_index)?.to_string() + DataType::Date32 => datetime_array_value_to_string(col, row_index, &self.date_format)?.to_string(), + DataType::Date64 => datetime_array_value_to_string(col, row_index, &self.datetime_format)?.to_string(), + DataType::Time32(TimeUnit::Second) => datetime_array_value_to_string(col, row_index, &self.time_format)?.to_string(), + DataType::Time32(TimeUnit::Millisecond) => datetime_array_value_to_string(col, row_index, &self.time_format)?.to_string(), + DataType::Time64(TimeUnit::Microsecond) => datetime_array_value_to_string(col, row_index, &self.time_format)?.to_string(), + DataType::Time64(TimeUnit::Nanosecond) => datetime_array_value_to_string(col, row_index, &self.time_format)?.to_string(), + DataType::Timestamp(_, time_zone) => { + match time_zone { + Some(_tz) => { + datetime_array_value_to_string(col, row_index, &self.timestamp_tz_format)?.to_string() + }, + None => { + datetime_array_value_to_string(col, row_index, &self.timestamp_format)?.to_string() + } + } } + DataType::Decimal128(..) => array_value_to_string(col, row_index)?.to_string(), t => { // List and Struct arrays not supported by the writer, any // other type needs to be implemented @@ -239,6 +257,16 @@ pub struct WriterBuilder { delimiter: Option, /// Whether to write column names as file headers. Defaults to `true` has_headers: bool, + /// Optional date format for date arrays + date_format: Option, + /// Optional datetime format for datetime arrays + datetime_format: Option, + /// Optional timestamp format for timestamp arrays + timestamp_format: Option, + /// Optional timestamp format for timestamp with timezone arrays + timestamp_tz_format: Option, + /// Optional time format for time arrays + time_format: Option, /// Optional value to represent null null_value: Option, } @@ -248,6 +276,11 @@ impl Default for WriterBuilder { Self { has_headers: true, delimiter: None, + date_format: Some(DEFAULT_DATE_FORMAT.to_string()), + datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), + time_format: Some(DEFAULT_TIME_FORMAT.to_string()), + timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), + timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), null_value: Some(DEFAULT_NULL_VALUE.to_string()), } } @@ -290,6 +323,30 @@ impl WriterBuilder { self } + /// Set the CSV file's date format + pub fn with_date_format(mut self, format: String) -> Self { + self.date_format = Some(format); + self + } + + /// Set the CSV file's datetime format + pub fn with_datetime_format(mut self, format: String) -> Self { + self.datetime_format = Some(format); + self + } + + /// Set the CSV file's time format + pub fn with_time_format(mut self, format: String) -> Self { + self.time_format = Some(format); + self + } + + /// Set the CSV file's timestamp format + pub fn with_timestamp_format(mut self, format: String) -> Self { + self.timestamp_format = Some(format); + self + } + /// Set the value to represent null in output pub fn with_null(mut self, null_value: String) -> Self { self.null_value = Some(null_value); @@ -304,6 +361,21 @@ impl WriterBuilder { Writer { writer, has_headers: self.has_headers, + date_format: self + .date_format + .unwrap_or_else(|| DEFAULT_DATE_FORMAT.to_string()), + datetime_format: self + .datetime_format + .unwrap_or_else(|| DEFAULT_TIMESTAMP_FORMAT.to_string()), + time_format: self + .time_format + .unwrap_or_else(|| DEFAULT_TIME_FORMAT.to_string()), + timestamp_format: self + .timestamp_format + .unwrap_or_else(|| DEFAULT_TIMESTAMP_FORMAT.to_string()), + timestamp_tz_format: self + .timestamp_tz_format + .unwrap_or_else(|| DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), beginning: true, null_value: self .null_value From 5a2fb4d98a8ec2a74311be6ee9f4d2e072188803 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Tue, 17 Jan 2023 18:56:02 -0800 Subject: [PATCH 04/22] Rollback the tests --- arrow-cast/src/display.rs | 159 ++++++++++++++++++++++---------------- arrow-csv/src/writer.rs | 109 ++------------------------ 2 files changed, 101 insertions(+), 167 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index ec2a06684b43..14452122023a 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -143,6 +143,26 @@ macro_rules! make_string_date { }}; } +macro_rules! make_string_date_with_format { + ($array_type:ty, $format: ident, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + + Ok(array + .value_as_date($row) + .map(|d| d.format($format).to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) + }}; +} + +macro_rules! handle_string_date { + ($array_type:ty, $format: ident, $column: ident, $row: ident) => {{ + match $format { + Some(format) => make_string_date_with_format!($array_type, format, $column, $row), + None => make_string_date!($array_type, $column, $row) + } + }}; +} + macro_rules! make_string_time { ($array_type:ty, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); @@ -154,6 +174,27 @@ macro_rules! make_string_time { }}; } +macro_rules! make_string_time_with_format { + ($array_type:ty, $format: ident, $column: ident, $row: ident) => {{ + let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + Ok(array + .value_as_time($row) + .map(|d| d.format($format).to_string()) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) + }}; +} + +macro_rules! handle_string_time { + ($array_type:ty, $format: ident, $column: ident, $row: ident) => { + match $format { + Some(format) => { + make_string_time_with_format!($array_type, format, $column, $row) + } + None => make_string_time!($array_type, $column, $row) + } + } +} + macro_rules! make_string_datetime { ($array_type:ty, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); @@ -195,6 +236,22 @@ macro_rules! make_string_datetime_with_format { }}; } +macro_rules! handle_string_datetime { + ($array_type:ty, $format: ident, $tz_string: ident, $column: ident, $row: ident) => { + match $format { + Some(format) => { + make_string_datetime_with_format!($array_type, format, $column, $row) + } + None => match $tz_string { + Some(tz_string) => { + make_string_datetime_with_tz!($array_type, tz_string, $column, $row) + } + None => make_string_datetime!($array_type, $column, $row), + }, + } + } +} + // It's not possible to do array.value($row).to_string() for &[u8], let's format it as hex macro_rules! make_string_hex { ($array_type:ty, $column: ident, $row: ident) => {{ @@ -269,6 +326,8 @@ macro_rules! make_string_from_duration { }}; } + + #[inline(always)] pub fn make_string_from_decimal( column: &Arc, @@ -337,7 +396,7 @@ fn append_map_field_string( fn array_value_to_string_internal( column: &ArrayRef, row: usize, - datetime_format: Option<&String> + datetime_format_opt: Option<&String> ) -> Result { if column.is_null(row) { return Ok("".to_string()); @@ -363,77 +422,47 @@ fn array_value_to_string_internal( DataType::Float32 => make_string!(array::Float32Array, column, row), DataType::Float64 => make_string!(array::Float64Array, column, row), DataType::Decimal128(..) => make_string_from_decimal(column, row), - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => { - match datetime_format { - Some(format) => make_string_datetime_with_format!( - array::TimestampSecondArray, - format, - column, - row - ), - None => match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( - array::TimestampSecondArray, - tz_string, - column, - row - ), - None => make_string_datetime!(array::TimestampSecondArray, column, row), - }, - } - } - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Millisecond => { - match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( - array::TimestampMillisecondArray, - tz_string, - column, - row - ), - None => { - make_string_datetime!(array::TimestampMillisecondArray, column, row) - } - } - } - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Microsecond => { - match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( - array::TimestampMicrosecondArray, - tz_string, - column, - row - ), - None => { - make_string_datetime!(array::TimestampMicrosecondArray, column, row) - } - } - } - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Nanosecond => { - match tz_string_opt { - Some(tz_string) => make_string_datetime_with_tz!( - array::TimestampNanosecondArray, - tz_string, - column, - row - ), - None => { - make_string_datetime!(array::TimestampNanosecondArray, column, row) - } - } - } - DataType::Date32 => make_string_date!(array::Date32Array, column, row), - DataType::Date64 => make_string_date!(array::Date64Array, column, row), + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => handle_string_datetime!( + array::TimestampSecondArray, + datetime_format_opt, + tz_string_opt, + column, + row + ), + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Millisecond => handle_string_datetime!( + array::TimestampMillisecondArray, + datetime_format_opt, + tz_string_opt, + column, + row + ), + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Microsecond => handle_string_datetime!( + array::TimestampMicrosecondArray, + datetime_format_opt, + tz_string_opt, + column, + row + ), + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Nanosecond => handle_string_datetime!( + array::TimestampNanosecondArray, + datetime_format_opt, + tz_string_opt, + column, + row + ), + DataType::Date32 => handle_string_date!(array::Date32Array, datetime_format_opt, column, row), + DataType::Date64 => handle_string_date!(array::Date64Array, datetime_format_opt, column, row), DataType::Time32(unit) if *unit == TimeUnit::Second => { - make_string_time!(array::Time32SecondArray, column, row) + handle_string_time!(array::Time32SecondArray, datetime_format_opt, column, row) } DataType::Time32(unit) if *unit == TimeUnit::Millisecond => { - make_string_time!(array::Time32MillisecondArray, column, row) + handle_string_time!(array::Time32MillisecondArray, datetime_format_opt, column, row) } DataType::Time64(unit) if *unit == TimeUnit::Microsecond => { - make_string_time!(array::Time64MicrosecondArray, column, row) + handle_string_time!(array::Time64MicrosecondArray, datetime_format_opt, column, row) } DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => { - make_string_time!(array::Time64NanosecondArray, column, row) + handle_string_time!(array::Time64NanosecondArray, datetime_format_opt, column, row) } DataType::Interval(unit) => match unit { IntervalUnit::DayTime => { diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 9e5bc7c98d59..3e013c980111 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -458,11 +458,11 @@ mod tests { let expected = r#"c1,c2,c3,c4,c5,c6,c7 Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo +consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes +sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo +consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes +sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo "#; assert_eq!(expected.to_string(), String::from_utf8(buffer).unwrap()); } @@ -508,7 +508,8 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo let builder = WriterBuilder::new() .has_headers(false) .with_delimiter(b'|') - .with_null("NULL".to_string()); + .with_null("NULL".to_string()) + .with_time_format("%r".to_string()); let mut writer = builder.build(&mut file); let batches = vec![&batch]; for batch in batches { @@ -522,105 +523,9 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo file.read_to_end(&mut buffer).unwrap(); assert_eq!( - "Lorem ipsum dolor sit amet|123.564532|3|true|00:20:34\nconsectetur adipiscing elit|NULL|2|false|06:51:20\nsed do eiusmod tempor|-556132.25|1|NULL|23:46:03\n" + "Lorem ipsum dolor sit amet|123.564532|3|true|12:20:34 AM\nconsectetur adipiscing elit|NULL|2|false|06:51:20 AM\nsed do eiusmod tempor|-556132.25|1|NULL|11:46:03 PM\n" .to_string(), String::from_utf8(buffer).unwrap() ); } - - #[test] - fn test_conversion_consistency() { - // test if we can serialize and deserialize whilst retaining the same type information/ precision - - let schema = Schema::new(vec![ - Field::new("c1", DataType::Date32, false), - Field::new("c2", DataType::Date64, false), - Field::new("c3", DataType::Timestamp(TimeUnit::Nanosecond, None), false), - ]); - - let nanoseconds = vec![ - 1599566300000000000, - 1599566200000000000, - 1599566100000000000, - ]; - let c1 = Date32Array::from(vec![3, 2, 1]); - let c2 = Date64Array::from(vec![3, 2, 1]); - let c3 = TimestampNanosecondArray::from(nanoseconds.clone()); - - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![Arc::new(c1), Arc::new(c2), Arc::new(c3)], - ) - .unwrap(); - - let builder = WriterBuilder::new().has_headers(false); - - let mut buf: Cursor> = Default::default(); - // drop the writer early to release the borrow. - { - let mut writer = builder.build(&mut buf); - writer.write(&batch).unwrap(); - } - buf.set_position(0); - - let mut reader = Reader::new( - buf, - Arc::new(schema), - false, - None, - 3, - // starting at row 2 and up to row 6. - None, - None, - None, - ); - let rb = reader.next().unwrap().unwrap(); - let c1 = rb.column(0).as_any().downcast_ref::().unwrap(); - let c2 = rb.column(1).as_any().downcast_ref::().unwrap(); - let c3 = rb - .column(2) - .as_any() - .downcast_ref::() - .unwrap(); - - let actual = c1.into_iter().collect::>(); - let expected = vec![Some(3), Some(2), Some(1)]; - assert_eq!(actual, expected); - let actual = c2.into_iter().collect::>(); - let expected = vec![Some(3), Some(2), Some(1)]; - assert_eq!(actual, expected); - let actual = c3.into_iter().collect::>(); - let expected = nanoseconds.into_iter().map(Some).collect::>(); - assert_eq!(actual, expected); - } - - #[test] - fn test_write_csv_invalid_cast() { - let schema = Schema::new(vec![ - Field::new("c0", DataType::UInt32, false), - Field::new("c1", DataType::Date64, false), - ]); - - let c0 = UInt32Array::from(vec![Some(123), Some(234)]); - let c1 = Date64Array::from(vec![Some(1926632005177), Some(1926632005177685347)]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c0), Arc::new(c1)]) - .unwrap(); - - let mut file = tempfile::tempfile().unwrap(); - let mut writer = Writer::new(&mut file); - let batches = vec![&batch, &batch]; - for batch in batches { - writer - .write(batch) - .map_err(|e| { - dbg!(e.to_string()); - assert!(e.to_string().ends_with( - invalid_cast_error("Date64", 1, 1).to_string().as_str() - )) - }) - .unwrap_err(); - } - drop(writer); - } } From 91cfb7f5b02ec91defb854ebed33e2dd9af2ae38 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Tue, 17 Jan 2023 19:45:40 -0800 Subject: [PATCH 05/22] Add option to use RFC3339 in CSV writeR --- arrow-cast/src/display.rs | 8 ++-- arrow-csv/src/writer.rs | 84 ++++++++++++++++++++++++--------------- 2 files changed, 56 insertions(+), 36 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 14452122023a..996b2897a9ee 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -396,7 +396,7 @@ fn append_map_field_string( fn array_value_to_string_internal( column: &ArrayRef, row: usize, - datetime_format_opt: Option<&String> + datetime_format_opt: &Option ) -> Result { if column.is_null(row) { return Ok("".to_string()); @@ -577,15 +577,15 @@ pub fn array_value_to_string( column: &ArrayRef, row: usize, ) -> Result { - array_value_to_string_internal(column, row, None) + array_value_to_string_internal(column, row, &None) } pub fn datetime_array_value_to_string( column: &ArrayRef, row: usize, - format: &String, + format: &Option, ) -> Result { - array_value_to_string_internal(column, row, Some(format)) + array_value_to_string_internal(column, row, format) } /// Converts the value of the union array at `row` to a String diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 3e013c980111..42894e25c4a4 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -113,21 +113,23 @@ pub struct Writer { /// Whether file should be written with headers. Defaults to `true` has_headers: bool, /// The date format for date arrays - date_format: String, + date_format: Option, /// The datetime format for datetime arrays - datetime_format: String, + datetime_format: Option, /// The timestamp format for timestamp arrays #[allow(dead_code)] - timestamp_format: String, + timestamp_format: Option, /// The timestamp format for timestamp (with timezone) arrays #[allow(dead_code)] - timestamp_tz_format: String, + timestamp_tz_format: Option, /// The time format for time arrays - time_format: String, + time_format: Option, /// Is the beginning-of-writer beginning: bool, /// The value to represent null entries null_value: String, + /// Is using RFC3339 format for date/time/timestamps + use_rfc3339: bool } impl Writer { @@ -139,13 +141,14 @@ impl Writer { Writer { writer, has_headers: true, - date_format: DEFAULT_DATE_FORMAT.to_string(), - datetime_format: DEFAULT_TIMESTAMP_FORMAT.to_string(), - time_format: DEFAULT_TIME_FORMAT.to_string(), - timestamp_format: DEFAULT_TIMESTAMP_FORMAT.to_string(), - timestamp_tz_format: DEFAULT_TIMESTAMP_TZ_FORMAT.to_string(), + date_format: Some(DEFAULT_DATE_FORMAT.to_string()), + datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), + time_format: Some(DEFAULT_TIME_FORMAT.to_string()), + timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), + timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), beginning: true, null_value: DEFAULT_NULL_VALUE.to_string(), + use_rfc3339: false } } @@ -269,6 +272,8 @@ pub struct WriterBuilder { time_format: Option, /// Optional value to represent null null_value: Option, + /// Whether to use RFC3339 format for timestamps. Defaults to `false` + use_rfc3339: bool } impl Default for WriterBuilder { @@ -282,6 +287,7 @@ impl Default for WriterBuilder { timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), null_value: Some(DEFAULT_NULL_VALUE.to_string()), + use_rfc3339: false } } } @@ -353,33 +359,47 @@ impl WriterBuilder { self } + /// Whether to use RFC3339 format for date/time/timestamps + pub fn with_rfc3339_format(mut self, use_rfc3339: bool) -> Self { + self.use_rfc3339 = use_rfc3339; + self + } + /// Create a new `Writer` pub fn build(self, writer: W) -> Writer { let delimiter = self.delimiter.unwrap_or(b','); let mut builder = csv::WriterBuilder::new(); let writer = builder.delimiter(delimiter).from_writer(writer); - Writer { - writer, - has_headers: self.has_headers, - date_format: self - .date_format - .unwrap_or_else(|| DEFAULT_DATE_FORMAT.to_string()), - datetime_format: self - .datetime_format - .unwrap_or_else(|| DEFAULT_TIMESTAMP_FORMAT.to_string()), - time_format: self - .time_format - .unwrap_or_else(|| DEFAULT_TIME_FORMAT.to_string()), - timestamp_format: self - .timestamp_format - .unwrap_or_else(|| DEFAULT_TIMESTAMP_FORMAT.to_string()), - timestamp_tz_format: self - .timestamp_tz_format - .unwrap_or_else(|| DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), - beginning: true, - null_value: self - .null_value - .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()), + if self.use_rfc3339 { + Writer { + writer, + has_headers: self.has_headers, + date_format: None, + datetime_format: None, + time_format: None, + timestamp_format: None, + timestamp_tz_format: None, + beginning: true, + null_value: self + .null_value + .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()), + use_rfc3339: self.use_rfc3339 + } + } else { + Writer { + writer, + has_headers: self.has_headers, + date_format: self.date_format, + datetime_format: self.datetime_format, + time_format: self.time_format, + timestamp_format: self.timestamp_format, + timestamp_tz_format: self.timestamp_tz_format, + beginning: true, + null_value: self + .null_value + .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()), + use_rfc3339: self.use_rfc3339 + } } } } From 44aff40c381bb434a5b72e3787e041f7d45c5a63 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Tue, 17 Jan 2023 20:11:08 -0800 Subject: [PATCH 06/22] Update tests --- arrow-csv/src/writer.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 42894e25c4a4..6a7334f4ec61 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -128,8 +128,6 @@ pub struct Writer { beginning: bool, /// The value to represent null entries null_value: String, - /// Is using RFC3339 format for date/time/timestamps - use_rfc3339: bool } impl Writer { @@ -147,8 +145,7 @@ impl Writer { timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), beginning: true, - null_value: DEFAULT_NULL_VALUE.to_string(), - use_rfc3339: false + null_value: DEFAULT_NULL_VALUE.to_string() } } @@ -360,7 +357,7 @@ impl WriterBuilder { } /// Whether to use RFC3339 format for date/time/timestamps - pub fn with_rfc3339_format(mut self, use_rfc3339: bool) -> Self { + pub fn with_rfc3339(mut self, use_rfc3339: bool) -> Self { self.use_rfc3339 = use_rfc3339; self } @@ -383,7 +380,6 @@ impl WriterBuilder { null_value: self .null_value .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()), - use_rfc3339: self.use_rfc3339 } } else { Writer { @@ -398,7 +394,6 @@ impl WriterBuilder { null_value: self .null_value .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()), - use_rfc3339: self.use_rfc3339 } } } @@ -529,7 +524,8 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo .has_headers(false) .with_delimiter(b'|') .with_null("NULL".to_string()) - .with_time_format("%r".to_string()); + .with_time_format("%r".to_string()) + .with_rfc3339(true); let mut writer = builder.build(&mut file); let batches = vec![&batch]; for batch in batches { @@ -543,7 +539,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo file.read_to_end(&mut buffer).unwrap(); assert_eq!( - "Lorem ipsum dolor sit amet|123.564532|3|true|12:20:34 AM\nconsectetur adipiscing elit|NULL|2|false|06:51:20 AM\nsed do eiusmod tempor|-556132.25|1|NULL|11:46:03 PM\n" + "Lorem ipsum dolor sit amet|123.564532|3|true|00:20:34\nconsectetur adipiscing elit|NULL|2|false|06:51:20\nsed do eiusmod tempor|-556132.25|1|NULL|23:46:03\n" .to_string(), String::from_utf8(buffer).unwrap() ); From 779a68175fca03609eaadca65e71de66f43fc9a2 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Tue, 17 Jan 2023 21:05:36 -0800 Subject: [PATCH 07/22] Fix linting errors --- arrow-cast/src/display.rs | 118 ++++++++++++++++++++++++-------------- arrow-csv/src/writer.rs | 66 ++++++++++++++------- 2 files changed, 121 insertions(+), 63 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 996b2897a9ee..d0d4dc5fd1c9 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -157,8 +157,10 @@ macro_rules! make_string_date_with_format { macro_rules! handle_string_date { ($array_type:ty, $format: ident, $column: ident, $row: ident) => {{ match $format { - Some(format) => make_string_date_with_format!($array_type, format, $column, $row), - None => make_string_date!($array_type, $column, $row) + Some(format) => { + make_string_date_with_format!($array_type, format, $column, $row) + } + None => make_string_date!($array_type, $column, $row), } }}; } @@ -190,9 +192,9 @@ macro_rules! handle_string_time { Some(format) => { make_string_time_with_format!($array_type, format, $column, $row) } - None => make_string_time!($array_type, $column, $row) + None => make_string_time!($array_type, $column, $row), } - } + }; } macro_rules! make_string_datetime { @@ -249,7 +251,7 @@ macro_rules! handle_string_datetime { None => make_string_datetime!($array_type, $column, $row), }, } - } + }; } // It's not possible to do array.value($row).to_string() for &[u8], let's format it as hex @@ -326,8 +328,6 @@ macro_rules! make_string_from_duration { }}; } - - #[inline(always)] pub fn make_string_from_decimal( column: &Arc, @@ -396,7 +396,7 @@ fn append_map_field_string( fn array_value_to_string_internal( column: &ArrayRef, row: usize, - datetime_format_opt: &Option + datetime_format_opt: &Option, ) -> Result { if column.is_null(row) { return Ok("".to_string()); @@ -422,47 +422,79 @@ fn array_value_to_string_internal( DataType::Float32 => make_string!(array::Float32Array, column, row), DataType::Float64 => make_string!(array::Float64Array, column, row), DataType::Decimal128(..) => make_string_from_decimal(column, row), - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => handle_string_datetime!( - array::TimestampSecondArray, - datetime_format_opt, - tz_string_opt, - column, - row - ), - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Millisecond => handle_string_datetime!( - array::TimestampMillisecondArray, - datetime_format_opt, - tz_string_opt, - column, - row - ), - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Microsecond => handle_string_datetime!( - array::TimestampMicrosecondArray, - datetime_format_opt, - tz_string_opt, - column, - row - ), - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Nanosecond => handle_string_datetime!( - array::TimestampNanosecondArray, - datetime_format_opt, - tz_string_opt, - column, - row - ), - DataType::Date32 => handle_string_date!(array::Date32Array, datetime_format_opt, column, row), - DataType::Date64 => handle_string_date!(array::Date64Array, datetime_format_opt, column, row), + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => { + handle_string_datetime!( + array::TimestampSecondArray, + datetime_format_opt, + tz_string_opt, + column, + row + ) + } + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Millisecond => { + handle_string_datetime!( + array::TimestampMillisecondArray, + datetime_format_opt, + tz_string_opt, + column, + row + ) + } + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Microsecond => { + handle_string_datetime!( + array::TimestampMicrosecondArray, + datetime_format_opt, + tz_string_opt, + column, + row + ) + } + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Nanosecond => { + handle_string_datetime!( + array::TimestampNanosecondArray, + datetime_format_opt, + tz_string_opt, + column, + row + ) + } + DataType::Date32 => { + handle_string_date!(array::Date32Array, datetime_format_opt, column, row) + } + DataType::Date64 => { + handle_string_date!(array::Date64Array, datetime_format_opt, column, row) + } DataType::Time32(unit) if *unit == TimeUnit::Second => { - handle_string_time!(array::Time32SecondArray, datetime_format_opt, column, row) + handle_string_time!( + array::Time32SecondArray, + datetime_format_opt, + column, + row + ) } DataType::Time32(unit) if *unit == TimeUnit::Millisecond => { - handle_string_time!(array::Time32MillisecondArray, datetime_format_opt, column, row) + handle_string_time!( + array::Time32MillisecondArray, + datetime_format_opt, + column, + row + ) } DataType::Time64(unit) if *unit == TimeUnit::Microsecond => { - handle_string_time!(array::Time64MicrosecondArray, datetime_format_opt, column, row) + handle_string_time!( + array::Time64MicrosecondArray, + datetime_format_opt, + column, + row + ) } DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => { - handle_string_time!(array::Time64NanosecondArray, datetime_format_opt, column, row) + handle_string_time!( + array::Time64NanosecondArray, + datetime_format_opt, + column, + row + ) } DataType::Interval(unit) => match unit { IntervalUnit::DayTime => { @@ -577,7 +609,7 @@ pub fn array_value_to_string( column: &ArrayRef, row: usize, ) -> Result { - array_value_to_string_internal(column, row, &None) + array_value_to_string_internal(column, row, &None) } pub fn datetime_array_value_to_string( diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 6a7334f4ec61..4e41062f07c2 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -65,7 +65,9 @@ use arrow_array::types::*; use arrow_array::*; -use arrow_cast::display::{lexical_to_string, array_value_to_string, datetime_array_value_to_string}; +use arrow_cast::display::{ + array_value_to_string, datetime_array_value_to_string, lexical_to_string, +}; use arrow_schema::*; use std::io::Write; @@ -145,7 +147,7 @@ impl Writer { timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), beginning: true, - null_value: DEFAULT_NULL_VALUE.to_string() + null_value: DEFAULT_NULL_VALUE.to_string(), } } @@ -178,23 +180,47 @@ impl Writer { DataType::Boolean => array_value_to_string(col, row_index)?.to_string(), DataType::Utf8 => array_value_to_string(col, row_index)?.to_string(), DataType::LargeUtf8 => array_value_to_string(col, row_index)?.to_string(), - DataType::Date32 => datetime_array_value_to_string(col, row_index, &self.date_format)?.to_string(), - DataType::Date64 => datetime_array_value_to_string(col, row_index, &self.datetime_format)?.to_string(), - DataType::Time32(TimeUnit::Second) => datetime_array_value_to_string(col, row_index, &self.time_format)?.to_string(), - DataType::Time32(TimeUnit::Millisecond) => datetime_array_value_to_string(col, row_index, &self.time_format)?.to_string(), - DataType::Time64(TimeUnit::Microsecond) => datetime_array_value_to_string(col, row_index, &self.time_format)?.to_string(), - DataType::Time64(TimeUnit::Nanosecond) => datetime_array_value_to_string(col, row_index, &self.time_format)?.to_string(), - DataType::Timestamp(_, time_zone) => { - match time_zone { - Some(_tz) => { - datetime_array_value_to_string(col, row_index, &self.timestamp_tz_format)?.to_string() - }, - None => { - datetime_array_value_to_string(col, row_index, &self.timestamp_format)?.to_string() - } - } + DataType::Date32 => { + datetime_array_value_to_string(col, row_index, &self.date_format)? + .to_string() + } + DataType::Date64 => { + datetime_array_value_to_string(col, row_index, &self.datetime_format)? + .to_string() + } + DataType::Time32(TimeUnit::Second) => { + datetime_array_value_to_string(col, row_index, &self.time_format)? + .to_string() + } + DataType::Time32(TimeUnit::Millisecond) => { + datetime_array_value_to_string(col, row_index, &self.time_format)? + .to_string() + } + DataType::Time64(TimeUnit::Microsecond) => { + datetime_array_value_to_string(col, row_index, &self.time_format)? + .to_string() + } + DataType::Time64(TimeUnit::Nanosecond) => { + datetime_array_value_to_string(col, row_index, &self.time_format)? + .to_string() + } + DataType::Timestamp(_, time_zone) => match time_zone { + Some(_tz) => datetime_array_value_to_string( + col, + row_index, + &self.timestamp_tz_format, + )? + .to_string(), + None => datetime_array_value_to_string( + col, + row_index, + &self.timestamp_format, + )? + .to_string(), + }, + DataType::Decimal128(..) => { + array_value_to_string(col, row_index)?.to_string() } - DataType::Decimal128(..) => array_value_to_string(col, row_index)?.to_string(), t => { // List and Struct arrays not supported by the writer, any // other type needs to be implemented @@ -270,7 +296,7 @@ pub struct WriterBuilder { /// Optional value to represent null null_value: Option, /// Whether to use RFC3339 format for timestamps. Defaults to `false` - use_rfc3339: bool + use_rfc3339: bool, } impl Default for WriterBuilder { @@ -284,7 +310,7 @@ impl Default for WriterBuilder { timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), null_value: Some(DEFAULT_NULL_VALUE.to_string()), - use_rfc3339: false + use_rfc3339: false, } } } From 5414464240ea2e7baffa206c414a95be0f29ad78 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Tue, 17 Jan 2023 22:50:58 -0800 Subject: [PATCH 08/22] fix tests --- arrow-cast/src/display.rs | 12 ++++++------ arrow/tests/csv.rs | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index d0d4dc5fd1c9..6589d960f906 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -240,13 +240,13 @@ macro_rules! make_string_datetime_with_format { macro_rules! handle_string_datetime { ($array_type:ty, $format: ident, $tz_string: ident, $column: ident, $row: ident) => { - match $format { - Some(format) => { - make_string_datetime_with_format!($array_type, format, $column, $row) + match $tz_string { + Some(tz_string) => { + make_string_datetime_with_tz!($array_type, tz_string, $column, $row) } - None => match $tz_string { - Some(tz_string) => { - make_string_datetime_with_tz!($array_type, tz_string, $column, $row) + None => match $format { + Some(format) => { + make_string_datetime_with_format!($array_type, format, $column, $row) } None => make_string_datetime!($array_type, $column, $row), }, diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs index 10b4cefbb64b..0c2c4a71e221 100644 --- a/arrow/tests/csv.rs +++ b/arrow/tests/csv.rs @@ -57,8 +57,8 @@ fn test_export_csv_timestamps() { drop(writer); let left = "c1,c2 -2019-04-18T20:54:47.378+10:00,2019-04-18T10:54:47.378 -2021-10-30T17:59:07+11:00,2021-10-30T06:59:07\n"; +2019-04-18T20:54:47.378+10:00,2019-04-18T10:54:47.378000000 +2021-10-30T17:59:07+11:00,2021-10-30T06:59:07.000000000\n"; let right = String::from_utf8(sw).unwrap(); assert_eq!(left, right); } From ee0c7943012e574b133bb4c948de476bb90d9c06 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Sat, 21 Jan 2023 10:42:42 -0800 Subject: [PATCH 09/22] Change with_rfc3339 factory method and use Option<&str> instead of &Option --- arrow-cast/src/display.rs | 6 +- arrow-csv/src/writer.rs | 114 ++++++++++++++++++++------------------ 2 files changed, 62 insertions(+), 58 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 6589d960f906..495bd3878189 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -396,7 +396,7 @@ fn append_map_field_string( fn array_value_to_string_internal( column: &ArrayRef, row: usize, - datetime_format_opt: &Option, + datetime_format_opt: Option<&str>, ) -> Result { if column.is_null(row) { return Ok("".to_string()); @@ -609,13 +609,13 @@ pub fn array_value_to_string( column: &ArrayRef, row: usize, ) -> Result { - array_value_to_string_internal(column, row, &None) + array_value_to_string_internal(column, row, None) } pub fn datetime_array_value_to_string( column: &ArrayRef, row: usize, - format: &Option, + format: Option<&str>, ) -> Result { array_value_to_string_internal(column, row, format) } diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 4e41062f07c2..1831f07648cd 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -180,41 +180,57 @@ impl Writer { DataType::Boolean => array_value_to_string(col, row_index)?.to_string(), DataType::Utf8 => array_value_to_string(col, row_index)?.to_string(), DataType::LargeUtf8 => array_value_to_string(col, row_index)?.to_string(), - DataType::Date32 => { - datetime_array_value_to_string(col, row_index, &self.date_format)? - .to_string() - } - DataType::Date64 => { - datetime_array_value_to_string(col, row_index, &self.datetime_format)? - .to_string() - } - DataType::Time32(TimeUnit::Second) => { - datetime_array_value_to_string(col, row_index, &self.time_format)? - .to_string() - } + DataType::Date32 => datetime_array_value_to_string( + col, + row_index, + self.date_format.as_deref(), + )? + .to_string(), + DataType::Date64 => datetime_array_value_to_string( + col, + row_index, + self.datetime_format.as_deref(), + )? + .to_string(), + DataType::Time32(TimeUnit::Second) => datetime_array_value_to_string( + col, + row_index, + self.time_format.as_deref(), + )? + .to_string(), DataType::Time32(TimeUnit::Millisecond) => { - datetime_array_value_to_string(col, row_index, &self.time_format)? - .to_string() + datetime_array_value_to_string( + col, + row_index, + self.time_format.as_deref(), + )? + .to_string() } DataType::Time64(TimeUnit::Microsecond) => { - datetime_array_value_to_string(col, row_index, &self.time_format)? - .to_string() - } - DataType::Time64(TimeUnit::Nanosecond) => { - datetime_array_value_to_string(col, row_index, &self.time_format)? - .to_string() + datetime_array_value_to_string( + col, + row_index, + self.time_format.as_deref(), + )? + .to_string() } + DataType::Time64(TimeUnit::Nanosecond) => datetime_array_value_to_string( + col, + row_index, + self.time_format.as_deref(), + )? + .to_string(), DataType::Timestamp(_, time_zone) => match time_zone { Some(_tz) => datetime_array_value_to_string( col, row_index, - &self.timestamp_tz_format, + self.timestamp_tz_format.as_deref(), )? .to_string(), None => datetime_array_value_to_string( col, row_index, - &self.timestamp_format, + self.timestamp_format.as_deref(), )? .to_string(), }, @@ -295,8 +311,6 @@ pub struct WriterBuilder { time_format: Option, /// Optional value to represent null null_value: Option, - /// Whether to use RFC3339 format for timestamps. Defaults to `false` - use_rfc3339: bool, } impl Default for WriterBuilder { @@ -310,7 +324,6 @@ impl Default for WriterBuilder { timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), null_value: Some(DEFAULT_NULL_VALUE.to_string()), - use_rfc3339: false, } } } @@ -382,9 +395,16 @@ impl WriterBuilder { self } - /// Whether to use RFC3339 format for date/time/timestamps + /// Use RFC3339 format for date/time/timestamps by clearing all + /// date/time specific formats. pub fn with_rfc3339(mut self, use_rfc3339: bool) -> Self { - self.use_rfc3339 = use_rfc3339; + if use_rfc3339 { + self.date_format = None; + self.datetime_format = None; + self.time_format = None; + self.timestamp_format = None; + self.timestamp_tz_format = None; + } self } @@ -393,34 +413,18 @@ impl WriterBuilder { let delimiter = self.delimiter.unwrap_or(b','); let mut builder = csv::WriterBuilder::new(); let writer = builder.delimiter(delimiter).from_writer(writer); - if self.use_rfc3339 { - Writer { - writer, - has_headers: self.has_headers, - date_format: None, - datetime_format: None, - time_format: None, - timestamp_format: None, - timestamp_tz_format: None, - beginning: true, - null_value: self - .null_value - .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()), - } - } else { - Writer { - writer, - has_headers: self.has_headers, - date_format: self.date_format, - datetime_format: self.datetime_format, - time_format: self.time_format, - timestamp_format: self.timestamp_format, - timestamp_tz_format: self.timestamp_tz_format, - beginning: true, - null_value: self - .null_value - .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()), - } + Writer { + writer, + has_headers: self.has_headers, + date_format: self.date_format, + datetime_format: self.datetime_format, + time_format: self.time_format, + timestamp_format: self.timestamp_format, + timestamp_tz_format: self.timestamp_tz_format, + beginning: true, + null_value: self + .null_value + .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()), } } } From 5d7a4cedf154476cd72eb7e0af754b1f7c61d2c2 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Sat, 21 Jan 2023 11:56:18 -0800 Subject: [PATCH 10/22] Keep old tests intact --- arrow-cast/src/display.rs | 75 ++++++++++++++++++++++----------------- arrow-csv/src/writer.rs | 5 ++- arrow/tests/csv.rs | 4 +-- 3 files changed, 46 insertions(+), 38 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 495bd3878189..14fa29c14051 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -28,6 +28,7 @@ use arrow_array::*; use arrow_buffer::ArrowNativeType; use arrow_schema::*; use chrono::prelude::SecondsFormat; +use chrono::{DateTime, Utc}; macro_rules! make_string { ($array_type:ty, $column: ident, $row: ident) => {{ @@ -198,28 +199,25 @@ macro_rules! handle_string_time { } macro_rules! make_string_datetime { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - - Ok(array - .value_as_datetime($row) - .map(|d| format!("{:?}", d)) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) - }}; -} - -macro_rules! make_string_datetime_with_tz { ($array_type:ty, $tz_string: ident, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - let s = match $tz_string.parse::() { - Ok(tz) => array - .value_as_datetime_with_tz($row, tz) - .map(|d| format!("{}", d.to_rfc3339_opts(SecondsFormat::AutoSi, true))) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), - Err(_) => array + let s = match $tz_string { + Some(tz_string) => match tz_string.parse::() { + Ok(tz) => array + .value_as_datetime_with_tz($row, tz) + .map(|d| { + format!("{}", d.to_rfc3339_opts(SecondsFormat::AutoSi, true)) + }) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), + Err(_) => array + .value_as_datetime($row) + .map(|d| format!("{:?} (Unknown Time Zone '{}')", d, tz_string)) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), + }, + None => array .value_as_datetime($row) - .map(|d| format!("{:?} (Unknown Time Zone '{}')", d, $tz_string)) + .map(|d| format!("{:?}", d)) .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), }; @@ -228,28 +226,39 @@ macro_rules! make_string_datetime_with_tz { } macro_rules! make_string_datetime_with_format { - ($array_type:ty, $format: ident, $column: ident, $row: ident) => {{ + ($array_type:ty, $format: ident, $tz_string: ident, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + let datetime = array.value_as_datetime($row); + + let s = match $tz_string { + Some(tz_string) => match tz_string.parse::() { + Ok(tz) => { + let utc_time = DateTime::::from_utc(datetime.unwrap(), Utc); + let local_time = utc_time.with_timezone(&tz); + local_time.format($format).to_string() + } + Err(_) => datetime + .map(|d| format!("{:?} (Unknown Time Zone '{}')", d, tz_string)) + .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), + }, + None => datetime.unwrap().format($format).to_string(), + }; - Ok(array - .value_as_datetime($row) - .map(|d| d.format($format).to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) + Ok(s) }}; } macro_rules! handle_string_datetime { ($array_type:ty, $format: ident, $tz_string: ident, $column: ident, $row: ident) => { - match $tz_string { - Some(tz_string) => { - make_string_datetime_with_tz!($array_type, tz_string, $column, $row) - } - None => match $format { - Some(format) => { - make_string_datetime_with_format!($array_type, format, $column, $row) - } - None => make_string_datetime!($array_type, $column, $row), - }, + match $format { + Some(format) => make_string_datetime_with_format!( + $array_type, + format, + $tz_string, + $column, + $row + ), + None => make_string_datetime!($array_type, $tz_string, $column, $row), } }; } diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 1831f07648cd..80bd69b67ca2 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -554,8 +554,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo .has_headers(false) .with_delimiter(b'|') .with_null("NULL".to_string()) - .with_time_format("%r".to_string()) - .with_rfc3339(true); + .with_time_format("%r".to_string()); let mut writer = builder.build(&mut file); let batches = vec![&batch]; for batch in batches { @@ -569,7 +568,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo file.read_to_end(&mut buffer).unwrap(); assert_eq!( - "Lorem ipsum dolor sit amet|123.564532|3|true|00:20:34\nconsectetur adipiscing elit|NULL|2|false|06:51:20\nsed do eiusmod tempor|-556132.25|1|NULL|23:46:03\n" + "Lorem ipsum dolor sit amet|123.564532|3|true|12:20:34 AM\nconsectetur adipiscing elit|NULL|2|false|06:51:20 AM\nsed do eiusmod tempor|-556132.25|1|NULL|11:46:03 PM\n" .to_string(), String::from_utf8(buffer).unwrap() ); diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs index 0c2c4a71e221..83a279ce4794 100644 --- a/arrow/tests/csv.rs +++ b/arrow/tests/csv.rs @@ -57,8 +57,8 @@ fn test_export_csv_timestamps() { drop(writer); let left = "c1,c2 -2019-04-18T20:54:47.378+10:00,2019-04-18T10:54:47.378000000 -2021-10-30T17:59:07+11:00,2021-10-30T06:59:07.000000000\n"; +2019-04-18T20:54:47.378000000+10:00,2019-04-18T10:54:47.378000000 +2021-10-30T17:59:07.000000000+11:00,2021-10-30T06:59:07.000000000\n"; let right = String::from_utf8(sw).unwrap(); assert_eq!(left, right); } From 9a3ceefce95246bd4c6f925bd825c23d77cfa1bf Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Sat, 21 Jan 2023 12:28:08 -0800 Subject: [PATCH 11/22] Add tests to check rfc3339 --- arrow-csv/src/writer.rs | 53 +++++++++++++++++++++++++++++++++++++++++ arrow/tests/csv.rs | 45 ++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 80bd69b67ca2..c2eddf30b36b 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -573,4 +573,57 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo String::from_utf8(buffer).unwrap() ); } + + #[test] + fn test_write_csv_using_rfc3339() { + let schema = Schema::new(vec![ + Field::new( + "c1", + DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".to_string())), + true, + ), + Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true), + Field::new("c3", DataType::Date32, false), + Field::new("c4", DataType::Time32(TimeUnit::Second), false), + ]); + + let c1 = TimestampMillisecondArray::from(vec![ + Some(1555584887378), + Some(1635577147000), + ]) + .with_timezone("+00:00".to_string()); + let c2 = TimestampMillisecondArray::from(vec![ + Some(1555584887378), + Some(1635577147000), + ]); + let c3 = Date32Array::from(vec![3, 2]); + let c4 = Time32SecondArray::from(vec![1234, 24680]); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![Arc::new(c1), Arc::new(c2), Arc::new(c3), Arc::new(c4)], + ) + .unwrap(); + + let mut file = tempfile::tempfile().unwrap(); + + let builder = WriterBuilder::new().with_rfc3339(true); + let mut writer = builder.build(&mut file); + let batches = vec![&batch]; + for batch in batches { + writer.write(batch).unwrap(); + } + drop(writer); + + file.rewind().unwrap(); + let mut buffer: Vec = vec![]; + file.read_to_end(&mut buffer).unwrap(); + + assert_eq!( + "c1,c2,c3,c4 +2019-04-18T10:54:47.378Z,2019-04-18T10:54:47.378,1970-01-04,00:20:34 +2021-10-30T06:59:07Z,2021-10-30T06:59:07,1970-01-03,06:51:20\n", + String::from_utf8(buffer).unwrap() + ); + } } diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs index 83a279ce4794..5a7c7e962a11 100644 --- a/arrow/tests/csv.rs +++ b/arrow/tests/csv.rs @@ -62,3 +62,48 @@ fn test_export_csv_timestamps() { let right = String::from_utf8(sw).unwrap(); assert_eq!(left, right); } + +#[test] +fn test_export_csv_timestamps_using_rfc3339() { + let schema = Schema::new(vec![ + Field::new( + "c1", + DataType::Timestamp( + TimeUnit::Millisecond, + Some("Australia/Sydney".to_string()), + ), + true, + ), + Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true), + ]); + + let c1 = TimestampMillisecondArray::from( + // 1555584887 converts to 2019-04-18, 20:54:47 in time zone Australia/Sydney (AEST). + // The offset (difference to UTC) is +10:00. + // 1635577147 converts to 2021-10-30 17:59:07 in time zone Australia/Sydney (AEDT) + // The offset (difference to UTC) is +11:00. Note that daylight savings is in effect on 2021-10-30. + // + vec![Some(1555584887378), Some(1635577147000)], + ) + .with_timezone("Australia/Sydney".to_string()); + let c2 = + TimestampMillisecondArray::from(vec![Some(1555584887378), Some(1635577147000)]); + let batch = + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); + + let mut sw = Vec::new(); + let mut writer = arrow_csv::WriterBuilder::new() + .with_rfc3339(true) + .build(&mut sw); + let batches = vec![&batch]; + for batch in batches { + writer.write(batch).unwrap(); + } + drop(writer); + + let left = "c1,c2 +2019-04-18T20:54:47.378+10:00,2019-04-18T10:54:47.378 +2021-10-30T17:59:07+11:00,2021-10-30T06:59:07\n"; + let right = String::from_utf8(sw).unwrap(); + assert_eq!(left, right); +} From f3f3eba49989e7c66d1ccd8778543a78541b5ec0 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Sat, 21 Jan 2023 12:43:51 -0800 Subject: [PATCH 12/22] Add back test_conversion_consistency --- arrow-cast/src/display.rs | 2 +- arrow-csv/src/writer.rs | 69 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 14fa29c14051..686662f4af6f 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -149,7 +149,7 @@ macro_rules! make_string_date_with_format { let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); Ok(array - .value_as_date($row) + .value_as_datetime($row) .map(|d| d.format($format).to_string()) .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) }}; diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index c2eddf30b36b..91971cc6761e 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -433,7 +433,8 @@ impl WriterBuilder { mod tests { use super::*; - use std::io::{Read, Seek}; + use crate::Reader; + use std::io::{Cursor, Read, Seek}; use std::sync::Arc; #[test] @@ -574,6 +575,72 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo ); } + #[test] + fn test_conversion_consistency() { + // test if we can serialize and deserialize whilst retaining the same type information/ precision + + let schema = Schema::new(vec![ + Field::new("c1", DataType::Date32, false), + Field::new("c2", DataType::Date64, false), + Field::new("c3", DataType::Timestamp(TimeUnit::Nanosecond, None), false), + ]); + + let nanoseconds = vec![ + 1599566300000000000, + 1599566200000000000, + 1599566100000000000, + ]; + let c1 = Date32Array::from(vec![3, 2, 1]); + let c2 = Date64Array::from(vec![3, 2, 1]); + let c3 = TimestampNanosecondArray::from(nanoseconds.clone()); + + let batch = RecordBatch::try_new( + Arc::new(schema.clone()), + vec![Arc::new(c1), Arc::new(c2), Arc::new(c3)], + ) + .unwrap(); + + let builder = WriterBuilder::new().has_headers(false); + + let mut buf: Cursor> = Default::default(); + // drop the writer early to release the borrow. + { + let mut writer = builder.build(&mut buf); + writer.write(&batch).unwrap(); + } + buf.set_position(0); + + let mut reader = Reader::new( + buf, + Arc::new(schema), + false, + None, + 3, + // starting at row 2 and up to row 6. + None, + None, + None, + ); + let rb = reader.next().unwrap().unwrap(); + let c1 = rb.column(0).as_any().downcast_ref::().unwrap(); + let c2 = rb.column(1).as_any().downcast_ref::().unwrap(); + let c3 = rb + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + + let actual = c1.into_iter().collect::>(); + let expected = vec![Some(3), Some(2), Some(1)]; + assert_eq!(actual, expected); + let actual = c2.into_iter().collect::>(); + let expected = vec![Some(3), Some(2), Some(1)]; + assert_eq!(actual, expected); + let actual = c3.into_iter().collect::>(); + let expected = nanoseconds.into_iter().map(Some).collect::>(); + assert_eq!(actual, expected); + } + #[test] fn test_write_csv_using_rfc3339() { let schema = Schema::new(vec![ From d476c27052b94be7a5377f75908acf2dd636ccf3 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Wed, 25 Jan 2023 12:42:30 -0800 Subject: [PATCH 13/22] Fix clippy errors --- arrow-csv/src/writer.rs | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 91971cc6761e..5ac69b5ec4c2 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -88,25 +88,6 @@ where lexical_to_string(c.value(i)) } -fn invalid_cast_error(dt: &str, col_index: usize, row_index: usize) -> ArrowError { - ArrowError::CastError(format!( - "Cannot cast to {dt} at col index: {col_index} row index: {row_index}" - )) -} - -macro_rules! write_temporal_value { - ($array:expr, $tpe: ident, $format: expr, $col_index: expr, $row_index: expr, $cast_func: ident, $tpe_name: expr) => {{ - $array - .as_any() - .downcast_ref::<$tpe>() - .ok_or_else(|| invalid_cast_error($tpe_name, $col_index, $row_index))? - .$cast_func($row_index) - .ok_or_else(|| invalid_cast_error($tpe_name, $col_index, $row_index))? - .format($format) - .to_string() - }}; -} - /// A CSV writer #[derive(Debug)] pub struct Writer { From 83e897de07fee71cfdcad6666348d082ac7f8be1 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Wed, 25 Jan 2023 13:18:59 -0800 Subject: [PATCH 14/22] Minor linting issue --- arrow-cast/src/display.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 686662f4af6f..629582c21f96 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -178,7 +178,7 @@ macro_rules! make_string_time { } macro_rules! make_string_time_with_format { - ($array_type:ty, $format: ident, $column: ident, $row: ident) => {{ + ($array_type:ty, $format: ident, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); Ok(array .value_as_time($row) From 5d7a603cb02ef661f4cf1859d41e35cbd66936ee Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Wed, 25 Jan 2023 13:36:40 -0800 Subject: [PATCH 15/22] Separate array_value_to_string and datetime_array_value_to_string --- arrow-cast/src/display.rs | 184 +++++++++++++++++--------------------- 1 file changed, 82 insertions(+), 102 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 629582c21f96..f1cdf0a26ff8 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -402,10 +402,9 @@ fn append_map_field_string( /// /// Note this function is quite inefficient and is unlikely to be /// suitable for converting large arrays or record batches. -fn array_value_to_string_internal( +pub fn array_value_to_string( column: &ArrayRef, row: usize, - datetime_format_opt: Option<&str>, ) -> Result { if column.is_null(row) { return Ok("".to_string()); @@ -431,91 +430,6 @@ fn array_value_to_string_internal( DataType::Float32 => make_string!(array::Float32Array, column, row), DataType::Float64 => make_string!(array::Float64Array, column, row), DataType::Decimal128(..) => make_string_from_decimal(column, row), - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => { - handle_string_datetime!( - array::TimestampSecondArray, - datetime_format_opt, - tz_string_opt, - column, - row - ) - } - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Millisecond => { - handle_string_datetime!( - array::TimestampMillisecondArray, - datetime_format_opt, - tz_string_opt, - column, - row - ) - } - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Microsecond => { - handle_string_datetime!( - array::TimestampMicrosecondArray, - datetime_format_opt, - tz_string_opt, - column, - row - ) - } - DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Nanosecond => { - handle_string_datetime!( - array::TimestampNanosecondArray, - datetime_format_opt, - tz_string_opt, - column, - row - ) - } - DataType::Date32 => { - handle_string_date!(array::Date32Array, datetime_format_opt, column, row) - } - DataType::Date64 => { - handle_string_date!(array::Date64Array, datetime_format_opt, column, row) - } - DataType::Time32(unit) if *unit == TimeUnit::Second => { - handle_string_time!( - array::Time32SecondArray, - datetime_format_opt, - column, - row - ) - } - DataType::Time32(unit) if *unit == TimeUnit::Millisecond => { - handle_string_time!( - array::Time32MillisecondArray, - datetime_format_opt, - column, - row - ) - } - DataType::Time64(unit) if *unit == TimeUnit::Microsecond => { - handle_string_time!( - array::Time64MicrosecondArray, - datetime_format_opt, - column, - row - ) - } - DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => { - handle_string_time!( - array::Time64NanosecondArray, - datetime_format_opt, - column, - row - ) - } - DataType::Interval(unit) => match unit { - IntervalUnit::DayTime => { - make_string_interval_day_time!(column, row) - } - IntervalUnit::YearMonth => { - make_string_interval_year_month!(column, row) - } - IntervalUnit::MonthDayNano => { - make_string_interval_month_day_nano!(column, row) - } - }, DataType::List(_) => make_string_from_list!(column, row), DataType::LargeList(_) => make_string_from_large_list!(column, row), DataType::Dictionary(index_type, _value_type) => match **index_type { @@ -593,6 +507,87 @@ fn array_value_to_string_internal( DataType::Union(field_vec, type_ids, mode) => { union_to_string(column, row, field_vec, type_ids, mode) } + _ => Err(ArrowError::InvalidArgumentError(format!( + "Pretty printing not implemented for {:?} type", + column.data_type() + ))), + } +} + +pub fn datetime_array_value_to_string( + column: &ArrayRef, + row: usize, + format: Option<&str>, +) -> Result { + if column.is_null(row) { + return Ok("".to_string()); + } + match column.data_type() { + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => { + handle_string_datetime!( + array::TimestampSecondArray, + format, + tz_string_opt, + column, + row + ) + } + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Millisecond => { + handle_string_datetime!( + array::TimestampMillisecondArray, + format, + tz_string_opt, + column, + row + ) + } + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Microsecond => { + handle_string_datetime!( + array::TimestampMicrosecondArray, + format, + tz_string_opt, + column, + row + ) + } + DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Nanosecond => { + handle_string_datetime!( + array::TimestampNanosecondArray, + format, + tz_string_opt, + column, + row + ) + } + DataType::Date32 => { + handle_string_date!(array::Date32Array, format, column, row) + } + DataType::Date64 => { + handle_string_date!(array::Date64Array, format, column, row) + } + DataType::Time32(unit) if *unit == TimeUnit::Second => { + handle_string_time!(array::Time32SecondArray, format, column, row) + } + DataType::Time32(unit) if *unit == TimeUnit::Millisecond => { + handle_string_time!(array::Time32MillisecondArray, format, column, row) + } + DataType::Time64(unit) if *unit == TimeUnit::Microsecond => { + handle_string_time!(array::Time64MicrosecondArray, format, column, row) + } + DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => { + handle_string_time!(array::Time64NanosecondArray, format, column, row) + } + DataType::Interval(unit) => match unit { + IntervalUnit::DayTime => { + make_string_interval_day_time!(column, row) + } + IntervalUnit::YearMonth => { + make_string_interval_year_month!(column, row) + } + IntervalUnit::MonthDayNano => { + make_string_interval_month_day_nano!(column, row) + } + }, DataType::Duration(unit) => match *unit { TimeUnit::Second => { make_string_from_duration!(array::DurationSecondArray, column, row) @@ -614,21 +609,6 @@ fn array_value_to_string_internal( } } -pub fn array_value_to_string( - column: &ArrayRef, - row: usize, -) -> Result { - array_value_to_string_internal(column, row, None) -} - -pub fn datetime_array_value_to_string( - column: &ArrayRef, - row: usize, - format: Option<&str>, -) -> Result { - array_value_to_string_internal(column, row, format) -} - /// Converts the value of the union array at `row` to a String fn union_to_string( column: &ArrayRef, From cab203014d71d4045bb668f1c94c163dd67ac256 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Thu, 26 Jan 2023 01:21:46 -0800 Subject: [PATCH 16/22] Add back invalid cast test --- arrow-cast/src/display.rs | 268 +++++++++++++++++++++++++++----------- arrow-csv/src/writer.rs | 64 +++++++-- arrow-json/src/writer.rs | 141 ++++---------------- 3 files changed, 268 insertions(+), 205 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index f1cdf0a26ff8..238239e9e343 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -30,6 +30,13 @@ use arrow_schema::*; use chrono::prelude::SecondsFormat; use chrono::{DateTime, Utc}; +fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError { + ArrowError::CastError(format!( + "Cannot cast to {} at col index: {} row index: {}", + dt, col_idx, row_idx + )) +} + macro_rules! make_string { ($array_type:ty, $column: ident, $row: ident) => {{ let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); @@ -134,91 +141,118 @@ macro_rules! make_string_interval_month_day_nano { } macro_rules! make_string_date { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - - Ok(array - .value_as_date($row) - .map(|d| d.to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) + ($array_type:ty, $dt:expr, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + Ok($column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .value_as_date($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .to_string()) }}; } macro_rules! make_string_date_with_format { - ($array_type:ty, $format: ident, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - - Ok(array - .value_as_datetime($row) - .map(|d| d.format($format).to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) + ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + Ok($column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .value_as_datetime($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .format($format) + .to_string()) }}; } macro_rules! handle_string_date { - ($array_type:ty, $format: ident, $column: ident, $row: ident) => {{ + ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ match $format { Some(format) => { - make_string_date_with_format!($array_type, format, $column, $row) + make_string_date_with_format!( + $array_type, + $dt, + format, + $column, + $col_idx, + $row_idx + ) } - None => make_string_date!($array_type, $column, $row), + None => make_string_date!($array_type, $dt, $column, $col_idx, $row_idx), } }}; } macro_rules! make_string_time { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - - Ok(array - .value_as_time($row) - .map(|d| d.to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) + ($array_type:ty, $dt:expr, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + Ok($column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .value_as_time($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .to_string()) }}; } macro_rules! make_string_time_with_format { - ($array_type:ty, $format: ident, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - Ok(array - .value_as_time($row) - .map(|d| d.format($format).to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) + ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + Ok($column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .value_as_time($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .format($format) + .to_string()) }}; } macro_rules! handle_string_time { - ($array_type:ty, $format: ident, $column: ident, $row: ident) => { + ($array_type:ty, $dt:expr, $format: ident, $column: ident, $col_idx:ident, $row_idx: ident) => { match $format { Some(format) => { - make_string_time_with_format!($array_type, format, $column, $row) + make_string_time_with_format!( + $array_type, + $dt, + format, + $column, + $col_idx, + $row_idx + ) } - None => make_string_time!($array_type, $column, $row), + None => make_string_time!($array_type, $dt, $column, $col_idx, $row_idx), } }; } macro_rules! make_string_datetime { - ($array_type:ty, $tz_string: ident, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); + ($array_type:ty, $dt:expr, $tz_string: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + let array = $column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; let s = match $tz_string { Some(tz_string) => match tz_string.parse::() { Ok(tz) => array - .value_as_datetime_with_tz($row, tz) - .map(|d| { - format!("{}", d.to_rfc3339_opts(SecondsFormat::AutoSi, true)) - }) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), - Err(_) => array - .value_as_datetime($row) - .map(|d| format!("{:?} (Unknown Time Zone '{}')", d, tz_string)) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), + .value_as_datetime_with_tz($row_idx, tz) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .to_rfc3339_opts(SecondsFormat::AutoSi, true) + .to_string(), + Err(_) => { + let datetime = array + .value_as_datetime($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; + format!("{:?} (Unknown Time Zone '{}')", datetime, tz_string) + } }, - None => array - .value_as_datetime($row) - .map(|d| format!("{:?}", d)) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), + None => { + let datetime = array + .value_as_datetime($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; + format!("{:?}", datetime) + } }; Ok(s) @@ -226,22 +260,26 @@ macro_rules! make_string_datetime { } macro_rules! make_string_datetime_with_format { - ($array_type:ty, $format: ident, $tz_string: ident, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - let datetime = array.value_as_datetime($row); + ($array_type:ty, $dt:expr, $format: ident, $tz_string: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ + let array = $column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; + let datetime = array.value_as_datetime($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; let s = match $tz_string { Some(tz_string) => match tz_string.parse::() { Ok(tz) => { - let utc_time = DateTime::::from_utc(datetime.unwrap(), Utc); + let utc_time = DateTime::::from_utc(datetime, Utc); let local_time = utc_time.with_timezone(&tz); local_time.format($format).to_string() } - Err(_) => datetime - .map(|d| format!("{:?} (Unknown Time Zone '{}')", d, tz_string)) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string()), + Err(_) => { + format!("{:?} (Unknown Time Zone '{}')", datetime, tz_string) + } }, - None => datetime.unwrap().format($format).to_string(), + None => datetime.format($format).to_string(), }; Ok(s) @@ -249,16 +287,25 @@ macro_rules! make_string_datetime_with_format { } macro_rules! handle_string_datetime { - ($array_type:ty, $format: ident, $tz_string: ident, $column: ident, $row: ident) => { + ($array_type:ty, $dt:expr, $format: ident, $tz_string: ident, $column: ident, $col_idx:ident, $row_idx: ident) => { match $format { Some(format) => make_string_datetime_with_format!( $array_type, + $dt, format, $tz_string, $column, - $row + $col_idx, + $row_idx + ), + None => make_string_datetime!( + $array_type, + $dt, + $tz_string, + $column, + $col_idx, + $row_idx ), - None => make_string_datetime!($array_type, $tz_string, $column, $row), } }; } @@ -514,92 +561,155 @@ pub fn array_value_to_string( } } -pub fn datetime_array_value_to_string( +pub fn temporal_array_value_to_string( column: &ArrayRef, - row: usize, + col_idx: usize, + row_idx: usize, format: Option<&str>, ) -> Result { - if column.is_null(row) { + if column.is_null(row_idx) { return Ok("".to_string()); } match column.data_type() { DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => { handle_string_datetime!( array::TimestampSecondArray, + "Timestamp", format, tz_string_opt, column, - row + col_idx, + row_idx ) } DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Millisecond => { handle_string_datetime!( array::TimestampMillisecondArray, + "Timestamp", format, tz_string_opt, column, - row + col_idx, + row_idx ) } DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Microsecond => { handle_string_datetime!( array::TimestampMicrosecondArray, + "Timestamp", format, tz_string_opt, column, - row + col_idx, + row_idx ) } DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Nanosecond => { handle_string_datetime!( array::TimestampNanosecondArray, + "Timestamp", format, tz_string_opt, column, - row + col_idx, + row_idx ) } DataType::Date32 => { - handle_string_date!(array::Date32Array, format, column, row) + handle_string_date!( + array::Date32Array, + "Date32", + format, + column, + col_idx, + row_idx + ) } DataType::Date64 => { - handle_string_date!(array::Date64Array, format, column, row) + handle_string_date!( + array::Date64Array, + "Date64", + format, + column, + col_idx, + row_idx + ) } DataType::Time32(unit) if *unit == TimeUnit::Second => { - handle_string_time!(array::Time32SecondArray, format, column, row) + handle_string_time!( + array::Time32SecondArray, + "Time32", + format, + column, + col_idx, + row_idx + ) } DataType::Time32(unit) if *unit == TimeUnit::Millisecond => { - handle_string_time!(array::Time32MillisecondArray, format, column, row) + handle_string_time!( + array::Time32MillisecondArray, + "Time32", + format, + column, + col_idx, + row_idx + ) } DataType::Time64(unit) if *unit == TimeUnit::Microsecond => { - handle_string_time!(array::Time64MicrosecondArray, format, column, row) + handle_string_time!( + array::Time64MicrosecondArray, + "Time64", + format, + column, + col_idx, + row_idx + ) } DataType::Time64(unit) if *unit == TimeUnit::Nanosecond => { - handle_string_time!(array::Time64NanosecondArray, format, column, row) + handle_string_time!( + array::Time64NanosecondArray, + "Time64", + format, + column, + col_idx, + row_idx + ) } DataType::Interval(unit) => match unit { IntervalUnit::DayTime => { - make_string_interval_day_time!(column, row) + make_string_interval_day_time!(column, row_idx) } IntervalUnit::YearMonth => { - make_string_interval_year_month!(column, row) + make_string_interval_year_month!(column, row_idx) } IntervalUnit::MonthDayNano => { - make_string_interval_month_day_nano!(column, row) + make_string_interval_month_day_nano!(column, row_idx) } }, DataType::Duration(unit) => match *unit { TimeUnit::Second => { - make_string_from_duration!(array::DurationSecondArray, column, row) + make_string_from_duration!(array::DurationSecondArray, column, row_idx) } TimeUnit::Millisecond => { - make_string_from_duration!(array::DurationMillisecondArray, column, row) + make_string_from_duration!( + array::DurationMillisecondArray, + column, + row_idx + ) } TimeUnit::Microsecond => { - make_string_from_duration!(array::DurationMicrosecondArray, column, row) + make_string_from_duration!( + array::DurationMicrosecondArray, + column, + row_idx + ) } TimeUnit::Nanosecond => { - make_string_from_duration!(array::DurationNanosecondArray, column, row) + make_string_from_duration!( + array::DurationNanosecondArray, + column, + row_idx + ) } }, _ => Err(ArrowError::InvalidArgumentError(format!( diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 5ac69b5ec4c2..729f7bf25b88 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -66,7 +66,7 @@ use arrow_array::types::*; use arrow_array::*; use arrow_cast::display::{ - array_value_to_string, datetime_array_value_to_string, lexical_to_string, + array_value_to_string, lexical_to_string, temporal_array_value_to_string, }; use arrow_schema::*; use std::io::Write; @@ -88,6 +88,13 @@ where lexical_to_string(c.value(i)) } +fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError { + ArrowError::CastError(format!( + "Cannot cast to {} at col index: {} row index: {}", + dt, col_idx, row_idx + )) +} + /// A CSV writer #[derive(Debug)] pub struct Writer { @@ -161,55 +168,63 @@ impl Writer { DataType::Boolean => array_value_to_string(col, row_index)?.to_string(), DataType::Utf8 => array_value_to_string(col, row_index)?.to_string(), DataType::LargeUtf8 => array_value_to_string(col, row_index)?.to_string(), - DataType::Date32 => datetime_array_value_to_string( + DataType::Date32 => temporal_array_value_to_string( col, + col_index, row_index, self.date_format.as_deref(), )? .to_string(), - DataType::Date64 => datetime_array_value_to_string( + DataType::Date64 => temporal_array_value_to_string( col, + col_index, row_index, self.datetime_format.as_deref(), )? .to_string(), - DataType::Time32(TimeUnit::Second) => datetime_array_value_to_string( + DataType::Time32(TimeUnit::Second) => temporal_array_value_to_string( col, + col_index, row_index, self.time_format.as_deref(), )? .to_string(), DataType::Time32(TimeUnit::Millisecond) => { - datetime_array_value_to_string( + temporal_array_value_to_string( col, + col_index, row_index, self.time_format.as_deref(), )? .to_string() } DataType::Time64(TimeUnit::Microsecond) => { - datetime_array_value_to_string( + temporal_array_value_to_string( col, + col_index, row_index, self.time_format.as_deref(), )? .to_string() } - DataType::Time64(TimeUnit::Nanosecond) => datetime_array_value_to_string( + DataType::Time64(TimeUnit::Nanosecond) => temporal_array_value_to_string( col, + col_index, row_index, self.time_format.as_deref(), )? .to_string(), DataType::Timestamp(_, time_zone) => match time_zone { - Some(_tz) => datetime_array_value_to_string( + Some(_tz) => temporal_array_value_to_string( col, + col_index, row_index, self.timestamp_tz_format.as_deref(), )? .to_string(), - None => datetime_array_value_to_string( + None => temporal_array_value_to_string( col, + col_index, row_index, self.timestamp_format.as_deref(), )? @@ -622,6 +637,37 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo assert_eq!(actual, expected); } + #[test] + fn test_write_csv_invalid_cast() { + let schema = Schema::new(vec![ + Field::new("c0", DataType::UInt32, false), + Field::new("c1", DataType::Date64, false), + ]); + + let c0 = UInt32Array::from(vec![Some(123), Some(234)]); + let c1 = Date64Array::from(vec![Some(1926632005177), Some(1926632005177685347)]); + let batch = + RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c0), Arc::new(c1)]) + .unwrap(); + + let mut file = tempfile::tempfile().unwrap(); + let mut writer = Writer::new(&mut file); + let batches = vec![&batch, &batch]; + + for batch in batches { + writer + .write(batch) + .map_err(|e| { + dbg!(e.to_string()); + assert!(e.to_string().ends_with( + invalid_cast_error("Date64", 1, 1).to_string().as_str() + )) + }) + .unwrap_err(); + } + drop(writer); + } + #[test] fn test_write_csv_using_rfc3339() { let schema = Schema::new(vec![ diff --git a/arrow-json/src/writer.rs b/arrow-json/src/writer.rs index 9d241aed3d28..fa7db4b862e9 100644 --- a/arrow-json/src/writer.rs +++ b/arrow-json/src/writer.rs @@ -105,7 +105,7 @@ use arrow_array::types::*; use arrow_array::*; use arrow_schema::*; -use arrow_cast::display::array_value_to_string; +use arrow_cast::display::temporal_array_value_to_string; fn primitive_array_to_json(array: &ArrayRef) -> Result, ArrowError> where @@ -137,6 +137,7 @@ fn struct_array_to_jsonmap_array( row_count, struct_col, inner_col_names[j], + j, )? } Ok(inner_objs) @@ -217,7 +218,7 @@ macro_rules! set_column_by_array_type { } macro_rules! set_temporal_column_by_array_type { - ($array_type:ident, $col_name:ident, $rows:ident, $array:ident, $row_count:ident, $cast_fn:ident) => { + ($col_name:ident, $col_idx:ident, $rows:ident, $array:ident, $row_count:ident) => { $rows .iter_mut() .enumerate() @@ -226,7 +227,10 @@ macro_rules! set_temporal_column_by_array_type { if !$array.is_null(i) { row.insert( $col_name.to_string(), - array_value_to_string($array, i).unwrap().to_string().into(), + temporal_array_value_to_string($array, $col_idx, i, None) + .unwrap() + .to_string() + .into(), ); } }); @@ -260,6 +264,7 @@ fn set_column_for_json_rows( row_count: usize, array: &ArrayRef, col_name: &str, + col_idx: usize, ) -> Result<(), ArrowError> { match array.data_type() { DataType::Int8 => { @@ -311,144 +316,46 @@ fn set_column_for_json_rows( ); } DataType::Date32 => { - set_temporal_column_by_array_type!( - Date32Array, - col_name, - rows, - array, - row_count, - value_as_date - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Date64 => { - set_temporal_column_by_array_type!( - Date64Array, - col_name, - rows, - array, - row_count, - value_as_date - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Timestamp(TimeUnit::Second, _) => { - set_temporal_column_by_array_type!( - TimestampSecondArray, - col_name, - rows, - array, - row_count, - value_as_datetime - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Timestamp(TimeUnit::Millisecond, _) => { - set_temporal_column_by_array_type!( - TimestampMillisecondArray, - col_name, - rows, - array, - row_count, - value_as_datetime - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Timestamp(TimeUnit::Microsecond, _) => { - set_temporal_column_by_array_type!( - TimestampMicrosecondArray, - col_name, - rows, - array, - row_count, - value_as_datetime - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Timestamp(TimeUnit::Nanosecond, _) => { - set_temporal_column_by_array_type!( - TimestampNanosecondArray, - col_name, - rows, - array, - row_count, - value_as_datetime - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Time32(TimeUnit::Second) => { - set_temporal_column_by_array_type!( - Time32SecondArray, - col_name, - rows, - array, - row_count, - value_as_time - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Time32(TimeUnit::Millisecond) => { - set_temporal_column_by_array_type!( - Time32MillisecondArray, - col_name, - rows, - array, - row_count, - value_as_time - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Time64(TimeUnit::Microsecond) => { - set_temporal_column_by_array_type!( - Time64MicrosecondArray, - col_name, - rows, - array, - row_count, - value_as_time - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Time64(TimeUnit::Nanosecond) => { - set_temporal_column_by_array_type!( - Time64NanosecondArray, - col_name, - rows, - array, - row_count, - value_as_time - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Duration(TimeUnit::Second) => { - set_temporal_column_by_array_type!( - DurationSecondArray, - col_name, - rows, - array, - row_count, - value_as_duration - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Duration(TimeUnit::Millisecond) => { - set_temporal_column_by_array_type!( - DurationMillisecondArray, - col_name, - rows, - array, - row_count, - value_as_duration - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Duration(TimeUnit::Microsecond) => { - set_temporal_column_by_array_type!( - DurationMicrosecondArray, - col_name, - rows, - array, - row_count, - value_as_duration - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Duration(TimeUnit::Nanosecond) => { - set_temporal_column_by_array_type!( - DurationNanosecondArray, - col_name, - rows, - array, - row_count, - value_as_duration - ); + set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count); } DataType::Struct(_) => { let inner_objs = @@ -492,7 +399,7 @@ fn set_column_for_json_rows( let slice = array.slice(0, row_count); let hydrated = arrow_cast::cast::cast(&slice, value_type) .expect("cannot cast dictionary to underlying values"); - set_column_for_json_rows(rows, row_count, &hydrated, col_name)?; + set_column_for_json_rows(rows, row_count, &hydrated, col_name, col_idx)?; } DataType::Map(_, _) => { let maparr = as_map_array(array); @@ -558,7 +465,7 @@ pub fn record_batches_to_json_rows( let row_count = batch.num_rows(); for (j, col) in batch.columns().iter().enumerate() { let col_name = schema.field(j).name(); - set_column_for_json_rows(&mut rows[base..], row_count, col, col_name)? + set_column_for_json_rows(&mut rows[base..], row_count, col, col_name, j)? } base += row_count; } From 286bc525bf54be06e767e4933f55192c076f9567 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Thu, 26 Jan 2023 01:28:31 -0800 Subject: [PATCH 17/22] Fix linting and clippy errors --- arrow-cast/src/display.rs | 21 +++++++++++---------- arrow-csv/src/writer.rs | 14 +++++++------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 238239e9e343..75eef25a29ef 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -229,9 +229,9 @@ macro_rules! handle_string_time { macro_rules! make_string_datetime { ($array_type:ty, $dt:expr, $tz_string: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ let array = $column - .as_any() - .downcast_ref::<$array_type>() - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; let s = match $tz_string { Some(tz_string) => match tz_string.parse::() { @@ -249,8 +249,8 @@ macro_rules! make_string_datetime { }, None => { let datetime = array - .value_as_datetime($row_idx) - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; + .value_as_datetime($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; format!("{:?}", datetime) } }; @@ -262,11 +262,12 @@ macro_rules! make_string_datetime { macro_rules! make_string_datetime_with_format { ($array_type:ty, $dt:expr, $format: ident, $tz_string: ident, $column: ident, $col_idx:ident, $row_idx: ident) => {{ let array = $column - .as_any() - .downcast_ref::<$array_type>() - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; - let datetime = array.value_as_datetime($row_idx) - .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; + let datetime = array + .value_as_datetime($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))?; let s = match $tz_string { Some(tz_string) => match tz_string.parse::() { diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 729f7bf25b88..ab389751ff22 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -88,13 +88,6 @@ where lexical_to_string(c.value(i)) } -fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError { - ArrowError::CastError(format!( - "Cannot cast to {} at col index: {} row index: {}", - dt, col_idx, row_idx - )) -} - /// A CSV writer #[derive(Debug)] pub struct Writer { @@ -433,6 +426,13 @@ mod tests { use std::io::{Cursor, Read, Seek}; use std::sync::Arc; + fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError { + ArrowError::CastError(format!( + "Cannot cast to {} at col index: {} row index: {}", + dt, col_idx, row_idx + )) + } + #[test] fn test_write_csv() { let schema = Schema::new(vec![ From 8b6e856579eae7296e0a03d0bd2901d15470387b Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Thu, 26 Jan 2023 01:36:34 -0800 Subject: [PATCH 18/22] Fix arrow-cast test --- arrow-cast/src/display.rs | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 75eef25a29ef..fdef4f7a373b 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -815,28 +815,49 @@ mod tests { } #[test] - fn test_array_value_to_string_duration() { + fn test_temporal_array_value_to_string_duration() { let ns_array = Arc::new(DurationNanosecondArray::from(vec![Some(1), None])) as ArrayRef; assert_eq!( - array_value_to_string(&ns_array, 0).unwrap(), + temporal_array_value_to_string(&ns_array, 0, 0, None).unwrap(), "PT0.000000001S" ); - assert_eq!(array_value_to_string(&ns_array, 1).unwrap(), ""); + assert_eq!( + temporal_array_value_to_string(&ns_array, 0, 1, None).unwrap(), + "" + ); let us_array = Arc::new(DurationMicrosecondArray::from(vec![Some(1), None])) as ArrayRef; - assert_eq!(array_value_to_string(&us_array, 0).unwrap(), "PT0.000001S"); - assert_eq!(array_value_to_string(&us_array, 1).unwrap(), ""); + assert_eq!( + temporal_array_value_to_string(&us_array, 0, 0, None).unwrap(), + "PT0.000001S" + ); + assert_eq!( + temporal_array_value_to_string(&us_array, 0, 1, None).unwrap(), + "" + ); let ms_array = Arc::new(DurationMillisecondArray::from(vec![Some(1), None])) as ArrayRef; - assert_eq!(array_value_to_string(&ms_array, 0).unwrap(), "PT0.001S"); - assert_eq!(array_value_to_string(&ms_array, 1).unwrap(), ""); + assert_eq!( + temporal_array_value_to_string(&ms_array, 0, 0, None).unwrap(), + "PT0.001S" + ); + assert_eq!( + temporal_array_value_to_string(&ms_array, 0, 1, None).unwrap(), + "" + ); let s_array = Arc::new(DurationSecondArray::from(vec![Some(1), None])) as ArrayRef; - assert_eq!(array_value_to_string(&s_array, 0).unwrap(), "PT1S"); - assert_eq!(array_value_to_string(&s_array, 1).unwrap(), ""); + assert_eq!( + temporal_array_value_to_string(&s_array, 0, 0, None).unwrap(), + "PT1S" + ); + assert_eq!( + temporal_array_value_to_string(&s_array, 0, 1, None).unwrap(), + "" + ); } } From c50104aa34516590961048f1fcd2e2dcb960cdcb Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Thu, 26 Jan 2023 10:52:46 -0800 Subject: [PATCH 19/22] Restructuring --- arrow-cast/src/display.rs | 244 +++++++++++++++++++------------------- 1 file changed, 121 insertions(+), 123 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index fdef4f7a373b..8515d5887692 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -450,128 +450,36 @@ fn append_map_field_string( /// /// Note this function is quite inefficient and is unlikely to be /// suitable for converting large arrays or record batches. -pub fn array_value_to_string( - column: &ArrayRef, - row: usize, -) -> Result { - if column.is_null(row) { - return Ok("".to_string()); - } - match column.data_type() { - DataType::Utf8 => make_string!(array::StringArray, column, row), - DataType::LargeUtf8 => make_string!(array::LargeStringArray, column, row), - DataType::Binary => make_string_hex!(array::BinaryArray, column, row), - DataType::LargeBinary => make_string_hex!(array::LargeBinaryArray, column, row), - DataType::FixedSizeBinary(_) => { - make_string_hex!(array::FixedSizeBinaryArray, column, row) - } - DataType::Boolean => make_string!(array::BooleanArray, column, row), - DataType::Int8 => make_string!(array::Int8Array, column, row), - DataType::Int16 => make_string!(array::Int16Array, column, row), - DataType::Int32 => make_string!(array::Int32Array, column, row), - DataType::Int64 => make_string!(array::Int64Array, column, row), - DataType::UInt8 => make_string!(array::UInt8Array, column, row), - DataType::UInt16 => make_string!(array::UInt16Array, column, row), - DataType::UInt32 => make_string!(array::UInt32Array, column, row), - DataType::UInt64 => make_string!(array::UInt64Array, column, row), - DataType::Float16 => make_string!(array::Float16Array, column, row), - DataType::Float32 => make_string!(array::Float32Array, column, row), - DataType::Float64 => make_string!(array::Float64Array, column, row), - DataType::Decimal128(..) => make_string_from_decimal(column, row), - DataType::List(_) => make_string_from_list!(column, row), - DataType::LargeList(_) => make_string_from_large_list!(column, row), - DataType::Dictionary(index_type, _value_type) => match **index_type { - DataType::Int8 => dict_array_value_to_string::(column, row), - DataType::Int16 => dict_array_value_to_string::(column, row), - DataType::Int32 => dict_array_value_to_string::(column, row), - DataType::Int64 => dict_array_value_to_string::(column, row), - DataType::UInt8 => dict_array_value_to_string::(column, row), - DataType::UInt16 => dict_array_value_to_string::(column, row), - DataType::UInt32 => dict_array_value_to_string::(column, row), - DataType::UInt64 => dict_array_value_to_string::(column, row), - _ => Err(ArrowError::InvalidArgumentError(format!( - "Pretty printing not supported for {:?} due to index type", - column.data_type() - ))), - }, - DataType::FixedSizeList(_, _) => make_string_from_fixed_size_list!(column, row), - DataType::Struct(_) => { - let st = column - .as_any() - .downcast_ref::() - .ok_or_else(|| { - ArrowError::InvalidArgumentError( - "Repl error: could not convert struct column to struct array." - .to_string(), - ) - })?; - - let mut s = String::new(); - s.push('{'); - let mut kv_iter = st.columns().iter().zip(st.column_names()); - if let Some((col, name)) = kv_iter.next() { - append_struct_field_string(&mut s, name, col, row)?; - } - for (col, name) in kv_iter { - s.push_str(", "); - append_struct_field_string(&mut s, name, col, row)?; - } - s.push('}'); - - Ok(s) - } - DataType::Map(_, _) => { - let map_array = - column.as_any().downcast_ref::().ok_or_else(|| { - ArrowError::InvalidArgumentError( - "Repl error: could not convert column to map array.".to_string(), - ) - })?; - let map_entry = map_array.value(row); - let st = map_entry - .as_any() - .downcast_ref::() - .ok_or_else(|| { - ArrowError::InvalidArgumentError( - "Repl error: could not convert map entry to struct array." - .to_string(), - ) - })?; - let mut s = String::new(); - s.push('{'); - let entries_count = st.column(0).len(); - for i in 0..entries_count { - if i > 0 { - s.push_str(", "); - } - append_map_field_string(&mut s, st.column(0), i)?; - s.push_str(": "); - append_map_field_string(&mut s, st.column(1), i)?; - } - s.push('}'); - - Ok(s) - } - DataType::Union(field_vec, type_ids, mode) => { - union_to_string(column, row, field_vec, type_ids, mode) - } - _ => Err(ArrowError::InvalidArgumentError(format!( - "Pretty printing not implemented for {:?} type", - column.data_type() - ))), - } -} - -pub fn temporal_array_value_to_string( +fn array_value_to_string_internal( column: &ArrayRef, col_idx: usize, row_idx: usize, - format: Option<&str>, + format: Option<&str> ) -> Result { if column.is_null(row_idx) { return Ok("".to_string()); } match column.data_type() { + DataType::Utf8 => make_string!(array::StringArray, column, row_idx), + DataType::LargeUtf8 => make_string!(array::LargeStringArray, column, row_idx), + DataType::Binary => make_string_hex!(array::BinaryArray, column, row_idx), + DataType::LargeBinary => make_string_hex!(array::LargeBinaryArray, column, row_idx), + DataType::FixedSizeBinary(_) => { + make_string_hex!(array::FixedSizeBinaryArray, column, row_idx) + } + DataType::Boolean => make_string!(array::BooleanArray, column, row_idx), + DataType::Int8 => make_string!(array::Int8Array, column, row_idx), + DataType::Int16 => make_string!(array::Int16Array, column, row_idx), + DataType::Int32 => make_string!(array::Int32Array, column, row_idx), + DataType::Int64 => make_string!(array::Int64Array, column, row_idx), + DataType::UInt8 => make_string!(array::UInt8Array, column, row_idx), + DataType::UInt16 => make_string!(array::UInt16Array, column, row_idx), + DataType::UInt32 => make_string!(array::UInt32Array, column, row_idx), + DataType::UInt64 => make_string!(array::UInt64Array, column, row_idx), + DataType::Float16 => make_string!(array::Float16Array, column, row_idx), + DataType::Float32 => make_string!(array::Float32Array, column, row_idx), + DataType::Float64 => make_string!(array::Float64Array, column, row_idx), + DataType::Decimal128(..) => make_string_from_decimal(column, row_idx), DataType::Timestamp(unit, tz_string_opt) if *unit == TimeUnit::Second => { handle_string_datetime!( array::TimestampSecondArray, @@ -687,6 +595,83 @@ pub fn temporal_array_value_to_string( make_string_interval_month_day_nano!(column, row_idx) } }, + DataType::List(_) => make_string_from_list!(column, row_idx), + DataType::LargeList(_) => make_string_from_large_list!(column, row_idx), + DataType::Dictionary(index_type, _value_type) => match **index_type { + DataType::Int8 => dict_array_value_to_string::(column, row_idx), + DataType::Int16 => dict_array_value_to_string::(column, row_idx), + DataType::Int32 => dict_array_value_to_string::(column, row_idx), + DataType::Int64 => dict_array_value_to_string::(column, row_idx), + DataType::UInt8 => dict_array_value_to_string::(column, row_idx), + DataType::UInt16 => dict_array_value_to_string::(column, row_idx), + DataType::UInt32 => dict_array_value_to_string::(column, row_idx), + DataType::UInt64 => dict_array_value_to_string::(column, row_idx), + _ => Err(ArrowError::InvalidArgumentError(format!( + "Pretty printing not supported for {:?} due to index type", + column.data_type() + ))), + }, + DataType::FixedSizeList(_, _) => make_string_from_fixed_size_list!(column, row_idx), + DataType::Struct(_) => { + let st = column + .as_any() + .downcast_ref::() + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + "Repl error: could not convert struct column to struct array." + .to_string(), + ) + })?; + + let mut s = String::new(); + s.push('{'); + let mut kv_iter = st.columns().iter().zip(st.column_names()); + if let Some((col, name)) = kv_iter.next() { + append_struct_field_string(&mut s, name, col, row_idx)?; + } + for (col, name) in kv_iter { + s.push_str(", "); + append_struct_field_string(&mut s, name, col, row_idx)?; + } + s.push('}'); + + Ok(s) + } + DataType::Map(_, _) => { + let map_array = + column.as_any().downcast_ref::().ok_or_else(|| { + ArrowError::InvalidArgumentError( + "Repl error: could not convert column to map array.".to_string(), + ) + })?; + let map_entry = map_array.value(row_idx); + let st = map_entry + .as_any() + .downcast_ref::() + .ok_or_else(|| { + ArrowError::InvalidArgumentError( + "Repl error: could not convert map entry to struct array." + .to_string(), + ) + })?; + let mut s = String::new(); + s.push('{'); + let entries_count = st.column(0).len(); + for i in 0..entries_count { + if i > 0 { + s.push_str(", "); + } + append_map_field_string(&mut s, st.column(0), i)?; + s.push_str(": "); + append_map_field_string(&mut s, st.column(1), i)?; + } + s.push('}'); + + Ok(s) + } + DataType::Union(field_vec, type_ids, mode) => { + union_to_string(column, row_idx, field_vec, type_ids, mode) + } DataType::Duration(unit) => match *unit { TimeUnit::Second => { make_string_from_duration!(array::DurationSecondArray, column, row_idx) @@ -720,6 +705,19 @@ pub fn temporal_array_value_to_string( } } +pub fn temporal_array_value_to_string( + column: &ArrayRef, + col_idx: usize, + row_idx: usize, + format: Option<&str>, +) -> Result { + array_value_to_string_internal(column, col_idx, row_idx, format) +} + +pub fn array_value_to_string(column: &ArrayRef, row_idx: usize) -> Result { + array_value_to_string_internal(column, 0, row_idx, None) +} + /// Converts the value of the union array at `row` to a String fn union_to_string( column: &ArrayRef, @@ -815,48 +813,48 @@ mod tests { } #[test] - fn test_temporal_array_value_to_string_duration() { + fn test_array_value_to_string_duration() { let ns_array = Arc::new(DurationNanosecondArray::from(vec![Some(1), None])) as ArrayRef; assert_eq!( - temporal_array_value_to_string(&ns_array, 0, 0, None).unwrap(), + array_value_to_string(&ns_array, 0).unwrap(), "PT0.000000001S" ); assert_eq!( - temporal_array_value_to_string(&ns_array, 0, 1, None).unwrap(), + array_value_to_string(&ns_array, 1).unwrap(), "" ); let us_array = Arc::new(DurationMicrosecondArray::from(vec![Some(1), None])) as ArrayRef; assert_eq!( - temporal_array_value_to_string(&us_array, 0, 0, None).unwrap(), + array_value_to_string(&us_array, 0).unwrap(), "PT0.000001S" ); assert_eq!( - temporal_array_value_to_string(&us_array, 0, 1, None).unwrap(), + array_value_to_string(&us_array, 1).unwrap(), "" ); let ms_array = Arc::new(DurationMillisecondArray::from(vec![Some(1), None])) as ArrayRef; assert_eq!( - temporal_array_value_to_string(&ms_array, 0, 0, None).unwrap(), + array_value_to_string(&ms_array, 0).unwrap(), "PT0.001S" ); assert_eq!( - temporal_array_value_to_string(&ms_array, 0, 1, None).unwrap(), + array_value_to_string(&ms_array, 1).unwrap(), "" ); let s_array = Arc::new(DurationSecondArray::from(vec![Some(1), None])) as ArrayRef; assert_eq!( - temporal_array_value_to_string(&s_array, 0, 0, None).unwrap(), + array_value_to_string(&s_array, 0).unwrap(), "PT1S" ); assert_eq!( - temporal_array_value_to_string(&s_array, 0, 1, None).unwrap(), + array_value_to_string(&s_array, 1).unwrap(), "" ); } From f75855c4cd5687bc0422f2c11c31eebdae7fc37b Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Thu, 26 Jan 2023 10:53:35 -0800 Subject: [PATCH 20/22] Fix formatting errors --- arrow-cast/src/display.rs | 50 ++++++++++++++------------------------- 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index 8515d5887692..b909a3d358ca 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -454,7 +454,7 @@ fn array_value_to_string_internal( column: &ArrayRef, col_idx: usize, row_idx: usize, - format: Option<&str> + format: Option<&str>, ) -> Result { if column.is_null(row_idx) { return Ok("".to_string()); @@ -463,7 +463,9 @@ fn array_value_to_string_internal( DataType::Utf8 => make_string!(array::StringArray, column, row_idx), DataType::LargeUtf8 => make_string!(array::LargeStringArray, column, row_idx), DataType::Binary => make_string_hex!(array::BinaryArray, column, row_idx), - DataType::LargeBinary => make_string_hex!(array::LargeBinaryArray, column, row_idx), + DataType::LargeBinary => { + make_string_hex!(array::LargeBinaryArray, column, row_idx) + } DataType::FixedSizeBinary(_) => { make_string_hex!(array::FixedSizeBinaryArray, column, row_idx) } @@ -611,7 +613,9 @@ fn array_value_to_string_internal( column.data_type() ))), }, - DataType::FixedSizeList(_, _) => make_string_from_fixed_size_list!(column, row_idx), + DataType::FixedSizeList(_, _) => { + make_string_from_fixed_size_list!(column, row_idx) + } DataType::Struct(_) => { let st = column .as_any() @@ -714,7 +718,10 @@ pub fn temporal_array_value_to_string( array_value_to_string_internal(column, col_idx, row_idx, format) } -pub fn array_value_to_string(column: &ArrayRef, row_idx: usize) -> Result { +pub fn array_value_to_string( + column: &ArrayRef, + row_idx: usize, +) -> Result { array_value_to_string_internal(column, 0, row_idx, None) } @@ -820,42 +827,21 @@ mod tests { array_value_to_string(&ns_array, 0).unwrap(), "PT0.000000001S" ); - assert_eq!( - array_value_to_string(&ns_array, 1).unwrap(), - "" - ); + assert_eq!(array_value_to_string(&ns_array, 1).unwrap(), ""); let us_array = Arc::new(DurationMicrosecondArray::from(vec![Some(1), None])) as ArrayRef; - assert_eq!( - array_value_to_string(&us_array, 0).unwrap(), - "PT0.000001S" - ); - assert_eq!( - array_value_to_string(&us_array, 1).unwrap(), - "" - ); + assert_eq!(array_value_to_string(&us_array, 0).unwrap(), "PT0.000001S"); + assert_eq!(array_value_to_string(&us_array, 1).unwrap(), ""); let ms_array = Arc::new(DurationMillisecondArray::from(vec![Some(1), None])) as ArrayRef; - assert_eq!( - array_value_to_string(&ms_array, 0).unwrap(), - "PT0.001S" - ); - assert_eq!( - array_value_to_string(&ms_array, 1).unwrap(), - "" - ); + assert_eq!(array_value_to_string(&ms_array, 0).unwrap(), "PT0.001S"); + assert_eq!(array_value_to_string(&ms_array, 1).unwrap(), ""); let s_array = Arc::new(DurationSecondArray::from(vec![Some(1), None])) as ArrayRef; - assert_eq!( - array_value_to_string(&s_array, 0).unwrap(), - "PT1S" - ); - assert_eq!( - array_value_to_string(&s_array, 1).unwrap(), - "" - ); + assert_eq!(array_value_to_string(&s_array, 0).unwrap(), "PT1S"); + assert_eq!(array_value_to_string(&s_array, 1).unwrap(), ""); } } From e52940ae155317f6ee4e13910ac748fd33247728 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Thu, 26 Jan 2023 13:06:19 -0800 Subject: [PATCH 21/22] Change make_duration_string to use invalid_cast_error --- arrow-cast/src/display.rs | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index b909a3d358ca..efcec71da9a9 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -375,13 +375,14 @@ macro_rules! make_string_from_fixed_size_list { } macro_rules! make_string_from_duration { - ($array_type:ty, $column: ident, $row: ident) => {{ - let array = $column.as_any().downcast_ref::<$array_type>().unwrap(); - - Ok(array - .value_as_duration($row) - .map(|d| d.to_string()) - .unwrap_or_else(|| "ERROR CONVERTING DATE".to_string())) + ($array_type:ty, $dt:expr, $column:ident, $col_idx:ident, $row_idx: ident) => {{ + Ok($column + .as_any() + .downcast_ref::<$array_type>() + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .value_as_duration($row_idx) + .ok_or_else(|| invalid_cast_error($dt, $col_idx, $row_idx))? + .to_string()) }}; } @@ -678,26 +679,38 @@ fn array_value_to_string_internal( } DataType::Duration(unit) => match *unit { TimeUnit::Second => { - make_string_from_duration!(array::DurationSecondArray, column, row_idx) + make_string_from_duration!( + array::DurationSecondArray, + "Duration", + column, + col_idx, + row_idx + ) } TimeUnit::Millisecond => { make_string_from_duration!( array::DurationMillisecondArray, + "Duration", column, + col_idx, row_idx ) } TimeUnit::Microsecond => { make_string_from_duration!( array::DurationMicrosecondArray, + "Duration", column, + col_idx, row_idx ) } TimeUnit::Nanosecond => { make_string_from_duration!( array::DurationNanosecondArray, + "Duration", column, + col_idx, row_idx ) } From 98251e190981ee3434de959eee91dec4f90b60f7 Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty Date: Thu, 26 Jan 2023 22:32:00 -0800 Subject: [PATCH 22/22] Fix clippy errors --- arrow-cast/src/display.rs | 3 +-- arrow-csv/src/writer.rs | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arrow-cast/src/display.rs b/arrow-cast/src/display.rs index efcec71da9a9..7214321127cf 100644 --- a/arrow-cast/src/display.rs +++ b/arrow-cast/src/display.rs @@ -32,8 +32,7 @@ use chrono::{DateTime, Utc}; fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError { ArrowError::CastError(format!( - "Cannot cast to {} at col index: {} row index: {}", - dt, col_idx, row_idx + "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}" )) } diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index ab389751ff22..94620be6629f 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -428,8 +428,7 @@ mod tests { fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError { ArrowError::CastError(format!( - "Cannot cast to {} at col index: {} row index: {}", - dt, col_idx, row_idx + "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}" )) }