From e72875e4b465f9d4eae1b852051e977488f97796 Mon Sep 17 00:00:00 2001 From: Patrick More <34631716+pjmore@users.noreply.github.com> Date: Sat, 22 Jan 2022 12:18:10 -0800 Subject: [PATCH] Update DECIMAL_RE to allow scientific notation in auto inferred schemas (#1216) * Update DECIMAL_RE to allow scientific notation in auto inferred schemas * Fixed format lint --- arrow/src/csv/reader.rs | 8 +++++--- arrow/test/data/various_types.csv | 4 +++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index 269e55f05859..0ade29c98cca 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -64,7 +64,8 @@ use std::ops::Neg; lazy_static! { static ref PARSE_DECIMAL_RE: Regex = Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap(); - static ref DECIMAL_RE: Regex = Regex::new(r"^-?(\d*\.\d+|\d+\.\d*)$").unwrap(); + static ref DECIMAL_RE: Regex = + Regex::new(r"^-?((\d*\.\d+|\d+\.\d*)([eE]-?\d+)?|\d+([eE]-?\d+))$").unwrap(); static ref INTEGER_RE: Regex = Regex::new(r"^-?(\d+)$").unwrap(); static ref BOOLEAN_RE: Regex = RegexBuilder::new(r"^(true)$|^(false)$") .case_insensitive(true) @@ -1570,7 +1571,7 @@ mod tests { let mut csv = builder.build(file).unwrap(); let batch = csv.next().unwrap().unwrap(); - assert_eq!(5, batch.num_rows()); + assert_eq!(7, batch.num_rows()); assert_eq!(6, batch.num_columns()); let schema = batch.schema(); @@ -1872,6 +1873,7 @@ mod tests { writeln!(csv1, "c1,c2,c3")?; writeln!(csv1, "1,\"foo\",0.5")?; writeln!(csv1, "3,\"bar\",1")?; + writeln!(csv1, "3,\"bar\",2e-06")?; // reading csv2 will set c2 to optional writeln!(csv2, "c1,c2,c3,c4")?; writeln!(csv2, "10,,3.14,true")?; @@ -1887,7 +1889,7 @@ mod tests { csv4.path().to_str().unwrap().to_string(), ], b',', - Some(3), // only csv1 and csv2 should be read + Some(4), // only csv1 and csv2 should be read true, )?; diff --git a/arrow/test/data/various_types.csv b/arrow/test/data/various_types.csv index 8f4466fbe6a4..570d07f5c221 100644 --- a/arrow/test/data/various_types.csv +++ b/arrow/test/data/various_types.csv @@ -3,4 +3,6 @@ c_int|c_float|c_string|c_bool|c_date|c_datetime 2|2.2|"2.22"|true|2020-11-08|2020-11-08T01:00:00 3||"3.33"|true|1969-12-31|1969-11-08T02:00:00 4|4.4||false|| -5|6.6|""|false|1990-01-01|1990-01-01T03:00:00 \ No newline at end of file +5|6.6|""|false|1990-01-01|1990-01-01T03:00:00 +4|4e6||false|| +4|4.0e-6||false|| \ No newline at end of file