From b5a9981c035924dd177bb97e580ff958a281737d Mon Sep 17 00:00:00 2001 From: patrick Date: Thu, 20 Jan 2022 16:58:04 -0700 Subject: [PATCH 1/2] Update DECIMAL_RE to allow scientific notation in auto inferred schemas --- arrow/src/csv/reader.rs | 7 ++++--- arrow/test/data/various_types.csv | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index 269e55f05859..e0c5ccb75909 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -64,7 +64,7 @@ use std::ops::Neg; lazy_static! { static ref PARSE_DECIMAL_RE: Regex = Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap(); - static ref DECIMAL_RE: Regex = Regex::new(r"^-?(\d*\.\d+|\d+\.\d*)$").unwrap(); + static ref DECIMAL_RE: Regex = Regex::new(r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$").unwrap(); static ref INTEGER_RE: Regex = Regex::new(r"^-?(\d+)$").unwrap(); static ref BOOLEAN_RE: Regex = RegexBuilder::new(r"^(true)$|^(false)$") .case_insensitive(true) @@ -1570,7 +1570,7 @@ mod tests { let mut csv = builder.build(file).unwrap(); let batch = csv.next().unwrap().unwrap(); - assert_eq!(5, batch.num_rows()); + assert_eq!(7, batch.num_rows()); assert_eq!(6, batch.num_columns()); let schema = batch.schema(); @@ -1872,6 +1872,7 @@ mod tests { writeln!(csv1, "c1,c2,c3")?; writeln!(csv1, "1,\"foo\",0.5")?; writeln!(csv1, "3,\"bar\",1")?; + writeln!(csv1, "3,\"bar\",2e-06")?; // reading csv2 will set c2 to optional writeln!(csv2, "c1,c2,c3,c4")?; writeln!(csv2, "10,,3.14,true")?; @@ -1887,7 +1888,7 @@ mod tests { csv4.path().to_str().unwrap().to_string(), ], b',', - Some(3), // only csv1 and csv2 should be read + Some(4), // only csv1 and csv2 should be read true, )?; diff --git a/arrow/test/data/various_types.csv b/arrow/test/data/various_types.csv index 8f4466fbe6a4..570d07f5c221 100644 --- a/arrow/test/data/various_types.csv +++ b/arrow/test/data/various_types.csv @@ -3,4 +3,6 @@ c_int|c_float|c_string|c_bool|c_date|c_datetime 2|2.2|"2.22"|true|2020-11-08|2020-11-08T01:00:00 3||"3.33"|true|1969-12-31|1969-11-08T02:00:00 4|4.4||false|| -5|6.6|""|false|1990-01-01|1990-01-01T03:00:00 \ No newline at end of file +5|6.6|""|false|1990-01-01|1990-01-01T03:00:00 +4|4e6||false|| +4|4.0e-6||false|| \ No newline at end of file From aa5d946aa88ebb69fd9bafeb1f9805323f499060 Mon Sep 17 00:00:00 2001 From: patrick Date: Fri, 21 Jan 2022 20:27:12 -0700 Subject: [PATCH 2/2] Fixed format lint --- arrow/src/csv/reader.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index e0c5ccb75909..0ade29c98cca 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -64,7 +64,8 @@ use std::ops::Neg; lazy_static! { static ref PARSE_DECIMAL_RE: Regex = Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap(); - static ref DECIMAL_RE: Regex = Regex::new(r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$").unwrap(); + static ref DECIMAL_RE: Regex = + Regex::new(r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$").unwrap(); static ref INTEGER_RE: Regex = Regex::new(r"^-?(\d+)$").unwrap(); static ref BOOLEAN_RE: Regex = RegexBuilder::new(r"^(true)$|^(false)$") .case_insensitive(true)