From d377b64a68b73cae30e35b4fd9943be6ff74a911 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 9 Oct 2023 17:08:41 +0100 Subject: [PATCH 1/4] Cleanup CSV WriterBuilder (#4735) --- arrow-csv/src/writer.rs | 117 ++++++++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 47 deletions(-) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index 840e8e8a93cc..c1008c8c2e91 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -70,11 +70,6 @@ use csv::ByteRecord; use std::io::Write; use crate::map_csv_error; - -const DEFAULT_DATE_FORMAT: &str = "%F"; -const DEFAULT_TIME_FORMAT: &str = "%T"; -const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f"; -const DEFAULT_TIMESTAMP_TZ_FORMAT: &str = "%FT%H:%M:%S.%9f%:z"; const DEFAULT_NULL_VALUE: &str = ""; /// A CSV writer @@ -97,26 +92,14 @@ pub struct Writer { /// Is the beginning-of-writer beginning: bool, /// The value to represent null entries - null_value: String, + null_value: Option, } impl Writer { /// Create a new CsvWriter from a writable object, with default options pub fn new(writer: W) -> Self { let delimiter = b','; - let mut builder = csv::WriterBuilder::new(); - let writer = builder.delimiter(delimiter).from_writer(writer); - Writer { - writer, - has_headers: true, - date_format: Some(DEFAULT_DATE_FORMAT.to_string()), - datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), - time_format: Some(DEFAULT_TIME_FORMAT.to_string()), - timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), - timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), - beginning: true, - null_value: DEFAULT_NULL_VALUE.to_string(), - } + WriterBuilder::new().with_delimiter(delimiter).build(writer) } /// Write a vector of record batches to a writable object @@ -138,7 +121,7 @@ impl Writer { } let options = FormatOptions::default() - .with_null(&self.null_value) + .with_null(self.null_value.as_deref().unwrap_or(DEFAULT_NULL_VALUE)) .with_date_format(self.date_format.as_deref()) .with_datetime_format(self.datetime_format.as_deref()) .with_timestamp_format(self.timestamp_format.as_deref()) @@ -207,9 +190,9 @@ impl RecordBatchWriter for Writer { #[derive(Clone, Debug)] pub struct WriterBuilder { /// Optional column delimiter. Defaults to `b','` - delimiter: Option, + delimiter: u8, /// Whether to write column names as file headers. Defaults to `true` - has_headers: bool, + has_header: bool, /// Optional date format for date arrays date_format: Option, /// Optional datetime format for datetime arrays @@ -227,14 +210,14 @@ pub struct WriterBuilder { impl Default for WriterBuilder { fn default() -> Self { Self { - has_headers: true, - delimiter: None, - date_format: Some(DEFAULT_DATE_FORMAT.to_string()), - datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), - time_format: Some(DEFAULT_TIME_FORMAT.to_string()), - timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()), - timestamp_tz_format: Some(DEFAULT_TIMESTAMP_TZ_FORMAT.to_string()), - null_value: Some(DEFAULT_NULL_VALUE.to_string()), + has_header: true, + delimiter: b',', + date_format: None, + datetime_format: None, + time_format: None, + timestamp_format: None, + timestamp_tz_format: None, + null_value: None, } } } @@ -254,7 +237,7 @@ impl WriterBuilder { /// let file = File::create("target/out.csv").unwrap(); /// /// // create a builder that doesn't write headers - /// let builder = WriterBuilder::new().has_headers(false); + /// let builder = WriterBuilder::new().with_header(false); /// let writer = builder.build(file); /// /// writer @@ -265,48 +248,91 @@ impl WriterBuilder { } /// Set whether to write headers + #[deprecated(note = "Use Self::with_header")] + #[doc(hidden)] pub fn has_headers(mut self, has_headers: bool) -> Self { - self.has_headers = has_headers; + self.has_header = has_headers; + self + } + + /// Set whether to write the CSV file with a header + pub fn with_header(mut self, header: bool) -> Self { + self.has_header = header; self } + /// Returns `true` if this writer is configured to write a header + pub fn header(&self) -> bool { + self.has_header + } + /// Set the CSV file's column delimiter as a byte character pub fn with_delimiter(mut self, delimiter: u8) -> Self { - self.delimiter = Some(delimiter); + self.delimiter = delimiter; self } + /// Get the CSV file's column delimiter as a byte character + pub fn delimiter(&self) -> u8 { + self.delimiter + } + /// Set the CSV file's date format pub fn with_date_format(mut self, format: String) -> Self { self.date_format = Some(format); self } + /// Get the CSV file's date format if set, defaults to RFC3339 + pub fn date_format(&self) -> Option<&str> { + self.date_format.as_deref() + } + /// Set the CSV file's datetime format pub fn with_datetime_format(mut self, format: String) -> Self { self.datetime_format = Some(format); self } + /// Get the CSV file's datetime format if set, defaults to RFC3339 + pub fn datetime_format(&self) -> Option<&str> { + self.datetime_format.as_deref() + } + /// Set the CSV file's time format pub fn with_time_format(mut self, format: String) -> Self { self.time_format = Some(format); self } + /// Get the CSV file's datetime time if set, defaults to RFC3339 + pub fn time_format(&self) -> Option<&str> { + self.time_format.as_deref() + } + /// Set the CSV file's timestamp format pub fn with_timestamp_format(mut self, format: String) -> Self { self.timestamp_format = Some(format); self } + /// Get the CSV file's timestamp format if set, defaults to RFC3339 + pub fn timestamp_format(&self) -> Option<&str> { + self.timestamp_format.as_deref() + } + /// Set the value to represent null in output pub fn with_null(mut self, null_value: String) -> Self { self.null_value = Some(null_value); self } - /// Use RFC3339 format for date/time/timestamps + /// Get the value to represent null in output + pub fn null(&self) -> &str { + self.null_value.as_deref().unwrap_or(DEFAULT_NULL_VALUE) + } + + /// Use RFC3339 format for date/time/timestamps (default) pub fn with_rfc3339(mut self) -> Self { self.date_format = None; self.datetime_format = None; @@ -318,21 +344,18 @@ impl WriterBuilder { /// Create a new `Writer` pub fn build(self, writer: W) -> Writer { - let delimiter = self.delimiter.unwrap_or(b','); let mut builder = csv::WriterBuilder::new(); - let writer = builder.delimiter(delimiter).from_writer(writer); + let writer = builder.delimiter(self.delimiter).from_writer(writer); Writer { writer, - has_headers: self.has_headers, + beginning: true, + has_headers: self.has_header, date_format: self.date_format, datetime_format: self.datetime_format, time_format: self.time_format, timestamp_format: self.timestamp_format, timestamp_tz_format: self.timestamp_tz_format, - beginning: true, - null_value: self - .null_value - .unwrap_or_else(|| DEFAULT_NULL_VALUE.to_string()), + null_value: self.null_value, } } } @@ -411,11 +434,11 @@ mod tests { let expected = r#"c1,c2,c3,c4,c5,c6,c7 Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo +consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes +sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34,cupcakes -consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20,cupcakes -sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo +consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378,06:51:20,cupcakes +sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo "#; assert_eq!(expected.to_string(), String::from_utf8(buffer).unwrap()); } @@ -512,7 +535,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo let mut file = tempfile::tempfile().unwrap(); let builder = WriterBuilder::new() - .has_headers(false) + .with_header(false) .with_delimiter(b'|') .with_null("NULL".to_string()) .with_time_format("%r".to_string()); @@ -560,7 +583,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03,foo ) .unwrap(); - let builder = WriterBuilder::new().has_headers(false); + let builder = WriterBuilder::new().with_header(false); let mut buf: Cursor> = Default::default(); // drop the writer early to release the borrow. From 3a2345106e0467e040225d114d2313a37631ab2f Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Mon, 9 Oct 2023 17:21:22 +0100 Subject: [PATCH 2/4] Update test --- arrow/tests/csv.rs | 42 ------------------------------------------ 1 file changed, 42 deletions(-) diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs index 3ee319101757..a79b6b44c2d3 100644 --- a/arrow/tests/csv.rs +++ b/arrow/tests/csv.rs @@ -53,48 +53,6 @@ fn test_export_csv_timestamps() { } drop(writer); - let left = "c1,c2 -2019-04-18T20:54:47.378000000+10:00,2019-04-18T10:54:47.378000000 -2021-10-30T17:59:07.000000000+11:00,2021-10-30T06:59:07.000000000\n"; - let right = String::from_utf8(sw).unwrap(); - assert_eq!(left, right); -} - -#[test] -fn test_export_csv_timestamps_using_rfc3339() { - let schema = Schema::new(vec![ - Field::new( - "c1", - DataType::Timestamp(TimeUnit::Millisecond, Some("Australia/Sydney".into())), - true, - ), - Field::new("c2", DataType::Timestamp(TimeUnit::Millisecond, None), true), - ]); - - let c1 = TimestampMillisecondArray::from( - // 1555584887 converts to 2019-04-18, 20:54:47 in time zone Australia/Sydney (AEST). - // The offset (difference to UTC) is +10:00. - // 1635577147 converts to 2021-10-30 17:59:07 in time zone Australia/Sydney (AEDT) - // The offset (difference to UTC) is +11:00. Note that daylight savings is in effect on 2021-10-30. - // - vec![Some(1555584887378), Some(1635577147000)], - ) - .with_timezone("Australia/Sydney"); - let c2 = - TimestampMillisecondArray::from(vec![Some(1555584887378), Some(1635577147000)]); - let batch = - RecordBatch::try_new(Arc::new(schema), vec![Arc::new(c1), Arc::new(c2)]).unwrap(); - - let mut sw = Vec::new(); - let mut writer = arrow_csv::WriterBuilder::new() - .with_rfc3339() - .build(&mut sw); - let batches = vec![&batch]; - for batch in batches { - writer.write(batch).unwrap(); - } - drop(writer); - let left = "c1,c2 2019-04-18T20:54:47.378+10:00,2019-04-18T10:54:47.378 2021-10-30T17:59:07+11:00,2021-10-30T06:59:07\n"; From 0aad511a4a53d461488d4cdc766ebc6ad97f2323 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 10 Oct 2023 20:47:22 +0100 Subject: [PATCH 3/4] Review feedback --- arrow-csv/src/writer.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index c1008c8c2e91..d99850c0f73c 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -77,21 +77,21 @@ const DEFAULT_NULL_VALUE: &str = ""; pub struct Writer { /// The object to write to writer: csv::Writer, - /// Whether file should be written with headers. Defaults to `true` + /// Whether file should be written with headers, defaults to `true` has_headers: bool, - /// The date format for date arrays + /// The date format for date arrays, defaults to RFC3339 date_format: Option, - /// The datetime format for datetime arrays + /// The datetime format for datetime arrays, defaults to RFC3339 datetime_format: Option, - /// The timestamp format for timestamp arrays + /// The timestamp format for timestamp arrays, defaults to RFC3339 timestamp_format: Option, - /// The timestamp format for timestamp (with timezone) arrays + /// The timestamp format for timestamp (with timezone) arrays, defaults to RFC3339 timestamp_tz_format: Option, - /// The time format for time arrays + /// The time format for time arrays, defaults to RFC3339 time_format: Option, /// Is the beginning-of-writer beginning: bool, - /// The value to represent null entries + /// The value to represent null entries, defaults to [`DEFAULT_NULL_VALUE`] null_value: Option, } @@ -333,6 +333,7 @@ impl WriterBuilder { } /// Use RFC3339 format for date/time/timestamps (default) + #[deprecated(note = "Use WriterBuilder::default()")] pub fn with_rfc3339(mut self) -> Self { self.date_format = None; self.datetime_format = None; From ef95e73aa01960171c60fd69e21ea7997bb42eda Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 10 Oct 2023 20:59:37 +0100 Subject: [PATCH 4/4] Clippy --- arrow-csv/src/writer.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-csv/src/writer.rs b/arrow-csv/src/writer.rs index d99850c0f73c..1ca956e2c73f 100644 --- a/arrow-csv/src/writer.rs +++ b/arrow-csv/src/writer.rs @@ -676,7 +676,7 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555,23:46:03,foo let mut file = tempfile::tempfile().unwrap(); - let builder = WriterBuilder::new().with_rfc3339(); + let builder = WriterBuilder::new(); let mut writer = builder.build(&mut file); let batches = vec![&batch]; for batch in batches {