From 735375126b6e8f5b7d7271432b147172d13b148e Mon Sep 17 00:00:00 2001 From: strRM <114995819+strRM@users.noreply.github.com> Date: Tue, 24 Oct 2023 13:21:39 -0400 Subject: [PATCH] Fixed CSV read failure involving blank lines in RFC4180 mode. (#2434) * Ensure we can read blank lines in CSV files. There was a problem handling blank lines in a quoted CSV column when reading in RFC4180 mode. Instead of continuing to read more lines, it dropped out, assuming there was at least 1 character. Now, if the line we read contains 0 characters, it immediately goes back to read more lines. I did not notice this before, because the test did not use RFC4180 CSV files for input or output. * Added a test for quotes in quoted CSV. This adds a test to make sure we can process quotes within a quoted column. A quote inside a quoted column must be escaped by doubling it (""). --- src/include/souffle/io/ReadStreamCSV.h | 6 ++++++ tests/semantic/CMakeLists.txt | 2 ++ tests/semantic/load11/A.csv | 2 +- tests/semantic/load11/facts/A.facts | 2 +- tests/semantic/load11/load11.dl | 6 +++--- tests/semantic/load12/A.csv | 4 ++++ tests/semantic/load12/facts/A.facts | 4 ++++ tests/semantic/load12/load12.dl | 4 ++++ tests/semantic/load12/load12.err | 0 tests/semantic/load12/load12.out | 0 tests/semantic/load13/A.csv | 1 + tests/semantic/load13/facts/A.facts | 1 + tests/semantic/load13/load13.dl | 4 ++++ tests/semantic/load13/load13.err | 0 tests/semantic/load13/load13.out | 0 15 files changed, 31 insertions(+), 5 deletions(-) create mode 100644 tests/semantic/load12/A.csv create mode 100644 tests/semantic/load12/facts/A.facts create mode 100644 tests/semantic/load12/load12.dl create mode 100644 tests/semantic/load12/load12.err create mode 100644 tests/semantic/load12/load12.out create mode 100644 tests/semantic/load13/A.csv create mode 100644 tests/semantic/load13/facts/A.facts create mode 100644 tests/semantic/load13/load13.dl create mode 100644 tests/semantic/load13/load13.err create mode 100644 
tests/semantic/load13/load13.out diff --git a/src/include/souffle/io/ReadStreamCSV.h b/src/include/souffle/io/ReadStreamCSV.h index 4bb938f1a8d..a90d63e557c 100644 --- a/src/include/souffle/io/ReadStreamCSV.h +++ b/src/include/souffle/io/ReadStreamCSV.h @@ -201,6 +201,12 @@ class ReadStreamCSV : public ReadStream { pos = 0; end = line.length(); } + if (pos == end) { + // this means we've got a blank line and we need to read + // more + continue; + } + char c = line[pos++]; if (c == '"' && (pos < end) && line[pos] == '"') { // two double-quote => one double-quote diff --git a/tests/semantic/CMakeLists.txt b/tests/semantic/CMakeLists.txt index ccf9a3db93d..ac8e41e590c 100644 --- a/tests/semantic/CMakeLists.txt +++ b/tests/semantic/CMakeLists.txt @@ -118,6 +118,8 @@ positive_test(load8) positive_test(load9) negative_test(load10) positive_test(load11) +positive_test(load12) +positive_test(load13) positive_test(load_adt) positive_test(load_adt2) positive_test(load_adt3) diff --git a/tests/semantic/load11/A.csv b/tests/semantic/load11/A.csv index f0653d1feb3..e7f5349787c 100644 --- a/tests/semantic/load11/A.csv +++ b/tests/semantic/load11/A.csv @@ -1,3 +1,3 @@ "foo -bar", nothing , "one +bar","nothing","one two" diff --git a/tests/semantic/load11/facts/A.facts b/tests/semantic/load11/facts/A.facts index f0653d1feb3..a8d746ddbd8 100644 --- a/tests/semantic/load11/facts/A.facts +++ b/tests/semantic/load11/facts/A.facts @@ -1,3 +1,3 @@ "foo -bar", nothing , "one +bar",nothing,"one two" diff --git a/tests/semantic/load11/load11.dl b/tests/semantic/load11/load11.dl index f3526bbf9da..30f384469a1 100644 --- a/tests/semantic/load11/load11.dl +++ b/tests/semantic/load11/load11.dl @@ -1,4 +1,4 @@ -.decl A(x:symbol) -.input A() -.output A() +.decl A(x:symbol,y:symbol,z:symbol) +.input A(rfc4180=true) +.output A(rfc4180=true) diff --git a/tests/semantic/load12/A.csv b/tests/semantic/load12/A.csv new file mode 100644 index 00000000000..c12c209f5b7 --- /dev/null +++ 
b/tests/semantic/load12/A.csv @@ -0,0 +1,4 @@ +"FOO","Line1 + +Line3. +" diff --git a/tests/semantic/load12/facts/A.facts b/tests/semantic/load12/facts/A.facts new file mode 100644 index 00000000000..ba835487ca3 --- /dev/null +++ b/tests/semantic/load12/facts/A.facts @@ -0,0 +1,4 @@ +FOO,"Line1 + +Line3. +" diff --git a/tests/semantic/load12/load12.dl b/tests/semantic/load12/load12.dl new file mode 100644 index 00000000000..3c9444ad353 --- /dev/null +++ b/tests/semantic/load12/load12.dl @@ -0,0 +1,4 @@ +.decl A(x:symbol, y:symbol) +.input A(rfc4180=true) +.output A(rfc4180=true) + diff --git a/tests/semantic/load12/load12.err b/tests/semantic/load12/load12.err new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/semantic/load12/load12.out b/tests/semantic/load12/load12.out new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/semantic/load13/A.csv b/tests/semantic/load13/A.csv new file mode 100644 index 00000000000..55da891096d --- /dev/null +++ b/tests/semantic/load13/A.csv @@ -0,0 +1 @@ +foo"bar" diff --git a/tests/semantic/load13/facts/A.facts b/tests/semantic/load13/facts/A.facts new file mode 100644 index 00000000000..9a689f98a9b --- /dev/null +++ b/tests/semantic/load13/facts/A.facts @@ -0,0 +1 @@ +"foo""bar""" diff --git a/tests/semantic/load13/load13.dl b/tests/semantic/load13/load13.dl new file mode 100644 index 00000000000..b6b8f47bb46 --- /dev/null +++ b/tests/semantic/load13/load13.dl @@ -0,0 +1,4 @@ +.decl A(x:symbol) +.input A(rfc4180=true) +.output A() + diff --git a/tests/semantic/load13/load13.err b/tests/semantic/load13/load13.err new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/semantic/load13/load13.out b/tests/semantic/load13/load13.out new file mode 100644 index 00000000000..e69de29bb2d