From 6294ce702be04fac1b39bc7543a8ec74fb7f85d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Mon, 25 Apr 2022 18:44:17 +0200 Subject: [PATCH 01/13] Add the skeleton for the Delimited File_Type --- .../Table/0.0.0-dev/src/Io/File_Format.enso | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index de0b87f3260b..dd6808d79a07 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -1,4 +1,5 @@ from Standard.Base import all +import Standard.Base.Error.Extensions as Errors from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding @@ -45,3 +46,55 @@ type Text read : File -> Problem_Behavior -> Any read file on_problems = file.read_text this.encoding on_problems + +## Read delimited files such as CSVs into a Table. +type Delimited + ## Read delimited files such as CSVs into a Table. + + If a row does not match the first row's column count, the function raises + an `Invalid_Row`. If a quote is opened and never closed, a + `Mismatched_Quote` warning occurs. + + Arguments: + - delimiter: The delimiter character to split the file into columns. An + `Illegal_Argument_Error` error is returned if this is an empty string. + - encoding: The encoding to use when reading the file. + - quote: The quote character denotes the start and end of a quoted value. + No quote character is used if set to `Nothing` or an empty value. + Quoted items are not split on the delimiter and can also contain + newlines. Within a quoted value, two consecutive quote characters are + interpreted as an instance of the quote character. Empty strings must + be quoted (e.g. "") as otherwise an empty value is treated as + `Nothing`. + - headers: If set to `True`, the first row is used as column names. If + set to `False`, the column names are generated by adding increasing + numeric suffixes to the base name `Column` (i.e. `Column_1`, + `Column_2` etc.). If set to `Infer`, the process tries to infer if + headers are present on the first row (`Infer` is not implemented yet). + - parse_values: The output columns are parsed using the default `Parser` + if 'True'. If more control over parsing is needed, the + `Table.parse_values` method allows full specifications of the parser + options. + - skip_rows: The number of rows to skip from the top of the file. + - row_limit: The maximum number of rows to read from the file. + + TODO [RW] The default for `headers` is temporarily changed to `False`, + because `Infer` is not supported. It should be changed to be the default + value once the corrresponding task is implemented: + https://www.pivotaltracker.com/story/show/181986831 + + TODO [RW] The default for `parse_values` is temporarily changed to + `False`, because this feature is not yet implemented. It should be + changed to `True` once the related task is implemented: + https://www.pivotaltracker.com/story/show/181824146 + type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (headers:True|False|Infer=Infer) (parse_values:Boolean=False) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) + + ## Implements the `File.read` for this `File_Format` + read : File -> Problem_Behavior -> Any + read file on_problems = + Errors.unimplemented "Not implemented yet." + +## FIXME [RW] where to put these? +type Infer +type Invalid_Row (index:Integer) (row:Text) +type Mismatched_Quote From c3113bb1fe7c9994976f802f228071bf120d34bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 26 Apr 2022 14:07:40 +0200 Subject: [PATCH 02/13] WIP preparing the decoder --- .../Standard/Table/0.0.0-dev/src/Io/Csv.enso | 15 +++-- .../Table/0.0.0-dev/src/Io/File_Format.enso | 29 ++++++--- .../org/enso/table/read/DelimitedReader.java | 62 +++++++++++++++++++ test/Table_Tests/src/Csv_Spec.enso | 2 + 4 files changed, 92 insertions(+), 16 deletions(-) create mode 100644 std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Csv.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Csv.enso index 6d46c4535056..00487b146d86 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Csv.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Csv.enso @@ -64,19 +64,18 @@ from_csv : File.File | Text -> Boolean -> Text -> Table ! Parse_Error from_csv csv has_header=True prefix='C' = parser_inst = Parser.create has_header prefix - handle_error error = case error of - Polyglot_Error err -> Error.throw (Parse_Error err.getMessage) - _ -> Panic.throw error + handle_error caught_panic = + Parse_Error caught_panic.payload.cause.getMessage case csv of Text -> input_stream = ByteArrayInputStream.new csv.utf_8.to_array - Panic.recover Any Table.Table (parser_inst.parse input_stream) . catch handle_error + Panic.catch Polyglot_Error (Table.Table (parser_inst.parse input_stream)) handle_error File.File _ -> - maybe_err = Panic.recover Any <| csv.with_input_stream [File.Option.Read] stream-> - stream.with_java_stream java_stream-> - Table.Table (parser_inst.parse java_stream) - maybe_err.catch handle_error + Panic.catch Polyglot_Error handler=handle_error <| + csv.with_input_stream [File.Option.Read] stream-> + stream.with_java_stream java_stream-> + Table.Table (parser_inst.parse java_stream) _ -> found_type_name = Meta.get_qualified_type_name csv file_name = Meta.get_qualified_type_name File.File diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index dd6808d79a07..9b58eaaec212 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -2,6 +2,7 @@ from Standard.Base import all import Standard.Base.Error.Extensions as Errors from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding +polyglot java import org.enso.table.read.DelimitedReader ## This type needs to be here to allow for the usage of Standard.Table functions. Ideally, it would be an interface within Standard.Base and @@ -20,6 +21,8 @@ type Auto output = Ref.new File_Format.Bytes if ".txt".equals_ignore_case extension then Ref.put output File_Format.Text if ".log".equals_ignore_case extension then Ref.put output File_Format.Text + if ".csv".equals_ignore_case extension then Ref.put output (File_Format.Delimited ',') + if ".tsv".equals_ignore_case extension then Ref.put output (File_Format.Delimited '\t') Ref.get output @@ -60,12 +63,11 @@ type Delimited `Illegal_Argument_Error` error is returned if this is an empty string. - encoding: The encoding to use when reading the file. - quote: The quote character denotes the start and end of a quoted value. - No quote character is used if set to `Nothing` or an empty value. - Quoted items are not split on the delimiter and can also contain - newlines. Within a quoted value, two consecutive quote characters are - interpreted as an instance of the quote character. Empty strings must - be quoted (e.g. "") as otherwise an empty value is treated as - `Nothing`. + No quote character is used if set to `Nothing`. Quoted items are not + split on the delimiter and can also contain newlines. Within a quoted + value, two consecutive quote characters are interpreted as an instance + of the quote character. Empty input strings must be quoted (e.g. "") as + otherwise an empty value is treated as `Nothing`. - headers: If set to `True`, the first row is used as column names. If set to `False`, the column names are generated by adding increasing numeric suffixes to the base name `Column` (i.e. `Column_1`, @@ -87,12 +89,23 @@ type Delimited `False`, because this feature is not yet implemented. It should be changed to `True` once the related task is implemented: https://www.pivotaltracker.com/story/show/181824146 - type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (headers:True|False|Infer=Infer) (parse_values:Boolean=False) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) + type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (headers:True|False|Infer=False) (parse_values:Boolean=False) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) ## Implements the `File.read` for this `File_Format` read : File -> Problem_Behavior -> Any read file on_problems = - Errors.unimplemented "Not implemented yet." + java_headers = case this.headers of + True -> DelimitedReader.Headers.USE_FIRST_ROW_AS_HEADERS + Infer -> Errors.unimplemented "Inferring headers is not implemented yet." + False -> DelimitedReader.Headers.GENERATE_HEADERS + if parse_values then Errors.unimplemented "Parsing values is not implemented yet." else + if encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else + reader = DelimitedReader.new this.delimiter this.quote java_headers + ## TODO test and handle errors like file not found or access denied etc. + Panic.catch Illegal_Argument_Error handler=(caught-> Error.throw (Illegal_Argument_Error caught.payload.cause.getMessage)) <| + file.with_input_stream [File.Option.Read] stream-> + stream.with_java_stream java_stream-> + Table.Table (reader.read java_stream) ## FIXME [RW] where to put these? type Infer diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java new file mode 100644 index 000000000000..b470500e9b09 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -0,0 +1,62 @@ +package org.enso.table.read; + +import java.io.InputStream; +import org.enso.table.data.table.Table; + +public class DelimitedReader { + public static enum Headers { + INFER, + USE_FIRST_ROW_AS_HEADERS, + GENERATE_HEADERS + } + + private final char delimiter; + private final char quoteCharacter; + private final Headers headers; + + private static final char noQuoteCharacter = '\0'; + + boolean hasQuote() { + return quoteCharacter != noQuoteCharacter; + } + + public DelimitedReader(String delimiter, String quote, Headers headers, int skipRows, int rowLimit) { + if (delimiter.isEmpty()) { + throw new IllegalArgumentException("Empty delimiters are not supported."); + } + if (delimiter.length() > 1) { + throw new IllegalArgumentException( + "Delimiters consisting of multiple characters or code units are not supported."); + } + + this.delimiter = delimiter.charAt(0); + + if (quote != null) { + if (quote.isEmpty()) { + throw new IllegalArgumentException( + "Empty quotes are not supported. Set the quote to `Nothing` to disable quotes."); + } + if (quote.length() > 1) { + throw new IllegalArgumentException( + "Quotes consisting of multiple characters or code units are not supported."); + } + + quoteCharacter = quote.charAt(0); + if (quoteCharacter == noQuoteCharacter) { + throw new IllegalArgumentException("Illegal quote character."); + } + } else { + quoteCharacter = noQuoteCharacter; + } + + this.headers = headers; + + if (headers == Headers.INFER) { + throw new IllegalStateException("Inferring headers is not yet implemented"); + } + } + + public Table read(InputStream inputStream) { + throw new RuntimeException("TODO"); + } +} diff --git a/test/Table_Tests/src/Csv_Spec.enso b/test/Table_Tests/src/Csv_Spec.enso index f19a90dc8327..c0331c4d3fdc 100644 --- a/test/Table_Tests/src/Csv_Spec.enso +++ b/test/Table_Tests/src/Csv_Spec.enso @@ -172,3 +172,5 @@ spec = out_1.delete_if_exists out_2.delete_if_exists out_3.delete_if_exists + +main = Test.Suite.run_main here.spec From 6661caf877bfc33a184f5455823c4cb618631d4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Apr 2022 00:18:42 +0200 Subject: [PATCH 03/13] Implement a prototype of Csv Parsing. Most of functionality is there but needs testing and some error handling is still missing. --- .../Table/0.0.0-dev/src/Io/File_Format.enso | 28 +++- .../runtime/src/main/resources/Builtins.enso | 5 +- .../org/enso/table/read/DelimitedReader.java | 119 +++++++++++++- .../java/org/enso/table/read/InvalidRow.java | 11 ++ test/Table_Tests/src/Delimited_Read_Spec.enso | 151 ++++++++++++++++++ test/Table_Tests/src/In_Memory_Tests.enso | 2 + 6 files changed, 299 insertions(+), 17 deletions(-) create mode 100644 std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java create mode 100644 test/Table_Tests/src/Delimited_Read_Spec.enso diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index 9b58eaaec212..d3be0d0a93b2 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -1,4 +1,6 @@ from Standard.Base import all +import Standard.Table + import Standard.Base.Error.Extensions as Errors from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding @@ -95,19 +97,31 @@ type Delimited read : File -> Problem_Behavior -> Any read file on_problems = java_headers = case this.headers of - True -> DelimitedReader.Headers.USE_FIRST_ROW_AS_HEADERS + True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS Infer -> Errors.unimplemented "Inferring headers is not implemented yet." - False -> DelimitedReader.Headers.GENERATE_HEADERS - if parse_values then Errors.unimplemented "Parsing values is not implemented yet." else - if encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else - reader = DelimitedReader.new this.delimiter this.quote java_headers + False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS + skip_rows = case this.skip_rows of + Nothing -> 0 + Integer -> skip_rows + _ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.") + row_limit = case this.row_limit of + Nothing -> -1 + Integer -> row_limit + _ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.") + if this.parse_values then Errors.unimplemented "Parsing values is not implemented yet." else + if this.encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else + reader = DelimitedReader.new this.delimiter this.quote java_headers skip_rows row_limit ## TODO test and handle errors like file not found or access denied etc. - Panic.catch Illegal_Argument_Error handler=(caught-> Error.throw (Illegal_Argument_Error caught.payload.cause.getMessage)) <| + translate_illegal_argument caught = + Error.throw (Illegal_Argument_Error caught.payload.cause.getMessage) + #wrap_invalid_row caught = + # Error.throw (Invalid_Row) + Panic.catch Illegal_Argument_Error handler=translate_illegal_argument <| file.with_input_stream [File.Option.Read] stream-> stream.with_java_stream java_stream-> Table.Table (reader.read java_stream) ## FIXME [RW] where to put these? type Infer -type Invalid_Row (index:Integer) (row:Text) +type Invalid_Row (index : Integer) (row : [Text]) type Mismatched_Quote diff --git a/engine/runtime/src/main/resources/Builtins.enso b/engine/runtime/src/main/resources/Builtins.enso index 4fd145412268..d94f9decf788 100644 --- a/engine/runtime/src/main/resources/Builtins.enso +++ b/engine/runtime/src/main/resources/Builtins.enso @@ -309,10 +309,11 @@ type Inexhaustive_Pattern_Match_Error scrutinee does not match the expected number of arguments. Arguments: - - expected: the expected number of arguments. + - expected_min: the minimum expected number of arguments. + - expected_max: the maximum expected number of arguments. - actual: the actual number of arguments passed. @Builtin_Type -type Arity_Error expected actual +type Arity_Error expected_min expected_max actual ## The error thrown when the program attempts to read from a state slot that has not yet been initialized. diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index b470500e9b09..aabdf57de707 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -1,10 +1,22 @@ package org.enso.table.read; +import com.univocity.parsers.csv.CsvFormat; +import com.univocity.parsers.csv.CsvParser; +import com.univocity.parsers.csv.CsvParserSettings; import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.enso.table.data.column.builder.string.StorageBuilder; +import org.enso.table.data.column.builder.string.StringStorageBuilder; +import org.enso.table.data.column.storage.Storage; +import org.enso.table.data.index.DefaultIndex; +import org.enso.table.data.table.Column; import org.enso.table.data.table.Table; +import org.enso.table.util.NameDeduplicator; public class DelimitedReader { - public static enum Headers { + public enum HeaderBehavior { INFER, USE_FIRST_ROW_AS_HEADERS, GENERATE_HEADERS @@ -12,7 +24,9 @@ public static enum Headers { private final char delimiter; private final char quoteCharacter; - private final Headers headers; + private final HeaderBehavior headerBehavior; + private final long skipRows; + private final long rowLimit; private static final char noQuoteCharacter = '\0'; @@ -20,7 +34,12 @@ boolean hasQuote() { return quoteCharacter != noQuoteCharacter; } - public DelimitedReader(String delimiter, String quote, Headers headers, int skipRows, int rowLimit) { + public DelimitedReader( + String delimiter, + String quote, + HeaderBehavior headerBehavior, + long skipRows, + long rowLimit) { if (delimiter.isEmpty()) { throw new IllegalArgumentException("Empty delimiters are not supported."); } @@ -49,14 +68,98 @@ public DelimitedReader(String delimiter, String quote, Headers headers, int skip quoteCharacter = noQuoteCharacter; } - this.headers = headers; + this.headerBehavior = headerBehavior; + this.skipRows = skipRows; + this.rowLimit = rowLimit; + } - if (headers == Headers.INFER) { - throw new IllegalStateException("Inferring headers is not yet implemented"); - } + private CsvParser setupCsvParser(InputStream inputStream) { + CsvParserSettings settings = new CsvParserSettings(); + settings.setHeaderExtractionEnabled(false); + CsvFormat format = new CsvFormat(); + format.setDelimiter(delimiter); + format.setQuote(quoteCharacter); + format.setQuoteEscape(quoteCharacter); + settings.setFormat(format); + settings.setMaxCharsPerColumn(-1); + CsvParser parser = new CsvParser(settings); + parser.beginParsing(inputStream); + return parser; } public Table read(InputStream inputStream) { - throw new RuntimeException("TODO"); + CsvParser parser = setupCsvParser(inputStream); + + List headerNames; + String[] currentRow = parser.parseNext(); + + // Skip the first N rows. + for (long i = 0; currentRow != null && i < skipRows; ++i) { + currentRow = parser.parseNext(); + } + + // If there are no rows to even infer the headers, we return an empty table. + if (currentRow == null) { + return new Table(new Column[0]); + } + + switch (headerBehavior) { + case INFER: + throw new IllegalStateException("Inferring headers is not yet implemented"); + case USE_FIRST_ROW_AS_HEADERS: + headerNames = NameDeduplicator.deduplicate(Arrays.asList(currentRow)); + // We have 'used up' the first row, so we load a next one. + currentRow = parser.parseNext(); + break; + case GENERATE_HEADERS: + headerNames = new ArrayList<>(currentRow.length); + for (int i = 0; i < currentRow.length; ++i) { + headerNames.add("Column_" + (i + 1)); + } + break; + default: + throw new IllegalStateException("Impossible branch."); + } + + StorageBuilder[] builders = initBuilders(currentRow.length); + + // TODO [RW] do we index rows from 0 or 1? do we include the header/skipped rows in position of + // error? + long index = 0; + + while (currentRow != null && (rowLimit < 0 || index < rowLimit)) { + if (currentRow.length != builders.length) { + // TODO [RW] do we throw or report as warnings and continue? + throw new InvalidRow(index, currentRow); + } else { + for (int i = 0; i < builders.length; i++) { + // TODO [RW] here we can plug-in some parsing logic, to be done as part of + // https://www.pivotaltracker.com/story/show/181824146 + String item = currentRow[i]; + builders[i] = builders[i].parseAndAppend(item); + } + } + + currentRow = parser.parseNext(); + index++; + } + + // TODO [RW] check if this should be a try-finally + parser.stopParsing(); + + Column[] columns = new Column[builders.length]; + for (int i = 0; i < builders.length; i++) { + Storage col = builders[i].seal(); + columns[i] = new Column(headerNames.get(i), new DefaultIndex(col.size()), col); + } + return new Table(columns); + } + + private StorageBuilder[] initBuilders(int count) { + StorageBuilder[] res = new StorageBuilder[count]; + for (int i = 0; i < count; i++) { + res[i] = new StringStorageBuilder(); + } + return res; } } diff --git a/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java b/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java new file mode 100644 index 000000000000..936396a999eb --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java @@ -0,0 +1,11 @@ +package org.enso.table.read; + +public class InvalidRow extends RuntimeException { + public final long index; + public final String[] row; + + public InvalidRow(long index, String[] row) { + this.index = index; + this.row = row; + } +} diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso new file mode 100644 index 000000000000..0384b37d5961 --- /dev/null +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -0,0 +1,151 @@ +from Standard.Base import all + +import Standard.Table +import Standard.Table.Data.Column +import Standard.Table.Io.File_Format +import Standard.Base.Error.Problem_Behavior +import Standard.Test +import project.Util + +spec = + c_1 = ["a", [1, 4, 7, 10]] + c_2 = ["b", [2, Nothing, 8, 11]] + c_3 = ["c", [Nothing, 6, 9, 12]] + expected_table = Table.new [c_1, c_2, c_3] + + Test.group "Delimited File Parsing" <| + Test.specify "should parse a simple numeric table" <| + simple_empty = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + simple_empty.should_equal expected_table + +# Test.specify "should handle duplicated columns" <| +# csv = """ +# name,x,y,x,y +# foo,10,20,30,20 +# t = Table.from_csv csv +# t.columns.map .name . should_equal ['name', 'x', 'y', 'x 1', 'y 1'] +# +# Test.group 'Writing' <| +# Test.specify 'should properly serialize simple tables' <| +# varied_column = (Enso_Project.data / "varied_column.csv") . read_csv has_header=False +# res = varied_column.to_csv +# exp = ''' +# C0,C1,C2,C3,C4,C5 +# 2005-02-25,2005-02-25,1,1,1.0,1 +# 2005-02-28,2005-02-28,2,2,2.0,2 +# 4,2005-03-01,3,3,3.0,3 +# 2005-03-02,,4,4,4.0,4 +# ,2005-03-03,5,5,5.0,5 +# 2005-03-04,2005-03-04,,6,6.25,6.25 +# 2005-03-07,2005-03-07,7,7,7.0,7 +# 2005-03-08,2005-03-08,8,8,8.0,osiem\n +# res.should_equal exp +# +# Test.specify 'should properly handle quoting of records and allow specifying separators and newlines' <| +# c1 = ['name', ['Robert");DROP TABLE Students;--', 'This;Name;;Is""Strange', 'Marcin,,']] +# c2 = ['grade', [10, 20, 'hello;world']] +# t = Table.new [c1, c2] +# +# expected_wrong_newline = """ +# name;grade +# "Robert"");DROP TABLE Students;--";10 +# "This;Name;;Is""""Strange";20 +# Marcin,,;"hello;world" +# +# expected = expected_wrong_newline.lines . join '\r\n' +# res = t.to_csv separator=';' line_ending=Line_Ending_Style.Windows +# res.should_equal expected+'\r\n' +# +# Test.specify 'should allow forced quoting of records' +# c1 = ['name', ['Robert");DROP TABLE Students;--', 'This;Name;;Is""Strange', 'Marcin,,']] +# c2 = ['grade', [10, 20, 'hello;world']] +# t = Table.new [c1, c2] +# +# expected = """ +# "name","grade" +# "Robert"");DROP TABLE Students;--","10" +# "This;Name;;Is""""Strange","20" +# "Marcin,,","hello;world" +# +# res = t.to_csv always_quote=True +# res.should_equal expected+'\n' +# +# +# Test.specify 'should write CSV to a file' <| +# varied_column = (Enso_Project.data / "varied_column.csv") . read_csv has_header=False +# out = Enso_Project.data / 'out.csv' +# out.delete_if_exists +# varied_column.write_csv out +# exp = ''' +# C0,C1,C2,C3,C4,C5 +# 2005-02-25,2005-02-25,1,1,1.0,1 +# 2005-02-28,2005-02-28,2,2,2.0,2 +# 4,2005-03-01,3,3,3.0,3 +# 2005-03-02,,4,4,4.0,4 +# ,2005-03-03,5,5,5.0,5 +# 2005-03-04,2005-03-04,,6,6.25,6.25 +# 2005-03-07,2005-03-07,7,7,7.0,7 +# 2005-03-08,2005-03-08,8,8,8.0,osiem\n +# out.read_text.should_equal exp +# out.delete_if_exists +# +# Test.specify 'should write CSV to multiple files, when row limit specified' <| +# varied_column = (Enso_Project.data / "varied_column.csv") . read_csv has_header=False +# out = Enso_Project.data / 'out.csv' +# out_1 = Enso_Project.data / 'out_1.csv' +# out_2 = Enso_Project.data / 'out_2.csv' +# out_3 = Enso_Project.data / 'out_3.csv' +# out_1.delete_if_exists +# out_2.delete_if_exists +# out_3.delete_if_exists +# varied_column.write_csv out max_rows_per_file=3 +# exp_1 = ''' +# C0,C1,C2,C3,C4,C5 +# 2005-02-25,2005-02-25,1,1,1.0,1 +# 2005-02-28,2005-02-28,2,2,2.0,2 +# 4,2005-03-01,3,3,3.0,3\n +# exp_2 = ''' +# C0,C1,C2,C3,C4,C5 +# 2005-03-02,,4,4,4.0,4 +# ,2005-03-03,5,5,5.0,5 +# 2005-03-04,2005-03-04,,6,6.25,6.25\n +# exp_3 = ''' +# C0,C1,C2,C3,C4,C5 +# 2005-03-07,2005-03-07,7,7,7.0,7 +# 2005-03-08,2005-03-08,8,8,8.0,osiem\n +# out_1.read_text.should_equal exp_1 +# out_2.read_text.should_equal exp_2 +# out_3.read_text.should_equal exp_3 +# out_1.delete_if_exists +# out_2.delete_if_exists +# out_3.delete_if_exists +# +# Test.specify 'should be possible through the write method' <| +# varied_column = (Enso_Project.data / "varied_column.csv") . read_csv has_header=False +# out = Enso_Project.data / 'out.csv' +# out_1 = Enso_Project.data / 'out_1.csv' +# out_2 = Enso_Project.data / 'out_2.csv' +# out_3 = Enso_Project.data / 'out_3.csv' +# out_1.delete_if_exists +# out_2.delete_if_exists +# out_3.delete_if_exists +# varied_column.write out (Table.Format.Csv include_header=False separator=';' max_rows_per_file=3) +# exp_1 = ''' +# 2005-02-25;2005-02-25;1;1;1.0;1 +# 2005-02-28;2005-02-28;2;2;2.0;2 +# 4;2005-03-01;3;3;3.0;3\n +# exp_2 = ''' +# 2005-03-02;;4;4;4.0;4 +# ;2005-03-03;5;5;5.0;5 +# 2005-03-04;2005-03-04;;6;6.25;6.25\n +# exp_3 = ''' +# 2005-03-07;2005-03-07;7;7;7.0;7 +# 2005-03-08;2005-03-08;8;8;8.0;osiem\n +# out_1.read_text.should_equal exp_1 +# out_2.read_text.should_equal exp_2 +# out_3.read_text.should_equal exp_3 +# out_1.delete_if_exists +# out_2.delete_if_exists +# out_3.delete_if_exists + +main = Test.Suite.run_main here.spec diff --git a/test/Table_Tests/src/In_Memory_Tests.enso b/test/Table_Tests/src/In_Memory_Tests.enso index 0d4ab4eba35c..8f50f5bc5aa9 100644 --- a/test/Table_Tests/src/In_Memory_Tests.enso +++ b/test/Table_Tests/src/In_Memory_Tests.enso @@ -5,6 +5,7 @@ import Standard.Test import project.Model_Spec import project.Column_Spec import project.Csv_Spec +import project.Delimited_Read_Spec import project.Json_Spec import project.Table_Spec import project.Spreadsheet_Spec @@ -14,6 +15,7 @@ import project.Aggregate_Spec in_memory_spec = Column_Spec.spec Csv_Spec.spec + Delimited_Read_Spec.spec Json_Spec.spec Spreadsheet_Spec.spec Table_Spec.spec From c039f6222ca00f346a9a425dd3991cb599d70bfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 27 Apr 2022 18:38:12 +0200 Subject: [PATCH 04/13] Work on error-handling --- .../Table/0.0.0-dev/src/Io/File_Format.enso | 28 ++++-- .../org/enso/table/read/DelimitedReader.java | 94 +++++++++++++++---- .../java/org/enso/table/read/InvalidRow.java | 10 +- .../table/read/ParsingFailedException.java | 14 +++ .../org/enso/table/read/ParsingProblem.java | 3 + test/Table_Tests/src/Delimited_Read_Spec.enso | 6 +- 6 files changed, 123 insertions(+), 32 deletions(-) create mode 100644 std-bits/table/src/main/java/org/enso/table/read/ParsingFailedException.java create mode 100644 std-bits/table/src/main/java/org/enso/table/read/ParsingProblem.java diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index d3be0d0a93b2..efb2228188a1 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -5,6 +5,8 @@ import Standard.Base.Error.Extensions as Errors from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding polyglot java import org.enso.table.read.DelimitedReader +polyglot java import org.enso.table.read.ParsingFailedException +polyglot java import org.enso.table.read.InvalidRow ## This type needs to be here to allow for the usage of Standard.Table functions. Ideally, it would be an interface within Standard.Base and @@ -70,6 +72,7 @@ type Delimited value, two consecutive quote characters are interpreted as an instance of the quote character. Empty input strings must be quoted (e.g. "") as otherwise an empty value is treated as `Nothing`. + - quote_escape: TODO - headers: If set to `True`, the first row is used as column names. If set to `False`, the column names are generated by adding increasing numeric suffixes to the base name `Column` (i.e. `Column_1`, @@ -81,6 +84,7 @@ type Delimited options. - skip_rows: The number of rows to skip from the top of the file. - row_limit: The maximum number of rows to read from the file. + - keep_invalid_rows: TODO TODO [RW] The default for `headers` is temporarily changed to `False`, because `Infer` is not supported. It should be changed to be the default @@ -91,7 +95,7 @@ type Delimited `False`, because this feature is not yet implemented. It should be changed to `True` once the related task is implemented: https://www.pivotaltracker.com/story/show/181824146 - type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (headers:True|False|Infer=False) (parse_values:Boolean=False) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) + type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=False) (parse_values:Boolean=False) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (keep_invalid_rows:Boolean=True) ## Implements the `File.read` for this `File_Format` read : File -> Problem_Behavior -> Any @@ -110,18 +114,26 @@ type Delimited _ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.") if this.parse_values then Errors.unimplemented "Parsing values is not implemented yet." else if this.encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else - reader = DelimitedReader.new this.delimiter this.quote java_headers skip_rows row_limit ## TODO test and handle errors like file not found or access denied etc. - translate_illegal_argument caught = - Error.throw (Illegal_Argument_Error caught.payload.cause.getMessage) - #wrap_invalid_row caught = - # Error.throw (Invalid_Row) + translate_illegal_argument caught_panic = + Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) + translate_problem java_problem = + if Java.is_instance java_problem InvalidRow then Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row) else + java_problem + translate_parsing_failure caught_panic = + Error.throw (translate_problem caught_panic.payload.cause.problem) + Panic.catch Illegal_Argument_Error handler=translate_illegal_argument <| + Panic.catch ParsingFailedException handler=translate_parsing_failure file.with_input_stream [File.Option.Read] stream-> stream.with_java_stream java_stream-> - Table.Table (reader.read java_stream) + warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error + reader = DelimitedReader.new java_stream this.delimiter this.quote this.quote_escape java_headers skip_rows row_limit this.keep_invalid_rows warnings_as_errors + result = Table.Table reader.read + problems = Vector.Vector reader.getReportedProblems . map translate_problem + on_problems.attach_problems_after result problems ## FIXME [RW] where to put these? type Infer -type Invalid_Row (index : Integer) (row : [Text]) +type Invalid_Row (source_file_row : Integer) (index : Integer | Nothing) (row : [Text]) type Mismatched_Quote diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index aabdf57de707..77e58259b852 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -24,9 +24,15 @@ public enum HeaderBehavior { private final char delimiter; private final char quoteCharacter; + private final char quoteEscapeCharacter; private final HeaderBehavior headerBehavior; private final long skipRows; private final long rowLimit; + private final InputStream inputStream; + private final List warnings = new ArrayList<>(); + private final CsvParser parser; + private final boolean keepInvalidRows; + private final boolean warningsAsErrors; private static final char noQuoteCharacter = '\0'; @@ -35,11 +41,15 @@ boolean hasQuote() { } public DelimitedReader( + InputStream inputStream, String delimiter, String quote, + String quoteEscape, HeaderBehavior headerBehavior, long skipRows, - long rowLimit) { + long rowLimit, + boolean keepInvalidRows, + boolean warningsAsErrors) { if (delimiter.isEmpty()) { throw new IllegalArgumentException("Empty delimiters are not supported."); } @@ -68,9 +78,29 @@ public DelimitedReader( quoteCharacter = noQuoteCharacter; } + if (quoteEscape != null) { + if (quoteEscape.isEmpty()) { + throw new IllegalArgumentException( + "Empty quote escapes are not supported. Set the escape to `Nothing` to disable escaping quotes."); + } + if (quoteEscape.length() > 1) { + throw new IllegalArgumentException( + "Quote escapes consisting of multiple characters or code units are not supported."); + } + + quoteEscapeCharacter = quoteEscape.charAt(0); + } else { + quoteEscapeCharacter = noQuoteCharacter; + } + this.headerBehavior = headerBehavior; this.skipRows = skipRows; this.rowLimit = rowLimit; + this.keepInvalidRows = keepInvalidRows; + this.warningsAsErrors = warningsAsErrors; + this.inputStream = inputStream; + + parser = setupCsvParser(inputStream); } private CsvParser setupCsvParser(InputStream inputStream) { @@ -79,7 +109,7 @@ private CsvParser setupCsvParser(InputStream inputStream) { CsvFormat format = new CsvFormat(); format.setDelimiter(delimiter); format.setQuote(quoteCharacter); - format.setQuoteEscape(quoteCharacter); + format.setQuoteEscape(quoteEscapeCharacter); settings.setFormat(format); settings.setMaxCharsPerColumn(-1); CsvParser parser = new CsvParser(settings); @@ -87,15 +117,33 @@ private CsvParser setupCsvParser(InputStream inputStream) { return parser; } - public Table read(InputStream inputStream) { - CsvParser parser = setupCsvParser(inputStream); + public List getReportedProblems() { + return warnings; + } + + private void reportProblem(ParsingProblem problem) { + if (warningsAsErrors) { + throw new ParsingFailedException(problem); + } else { + warnings.add(problem); + } + } + + private long source_row = 0; + private long target_table_index = 0; + private String[] nextRow() { + source_row++; + return parser.parseNext(); + } + + public Table read() { List headerNames; - String[] currentRow = parser.parseNext(); + String[] currentRow = nextRow(); // Skip the first N rows. for (long i = 0; currentRow != null && i < skipRows; ++i) { - currentRow = parser.parseNext(); + currentRow = nextRow(); } // If there are no rows to even infer the headers, we return an empty table. @@ -109,7 +157,7 @@ public Table read(InputStream inputStream) { case USE_FIRST_ROW_AS_HEADERS: headerNames = NameDeduplicator.deduplicate(Arrays.asList(currentRow)); // We have 'used up' the first row, so we load a next one. - currentRow = parser.parseNext(); + currentRow = nextRow(); break; case GENERATE_HEADERS: headerNames = new ArrayList<>(currentRow.length); @@ -123,14 +171,25 @@ public Table read(InputStream inputStream) { StorageBuilder[] builders = initBuilders(currentRow.length); - // TODO [RW] do we index rows from 0 or 1? do we include the header/skipped rows in position of - // error? - long index = 0; - - while (currentRow != null && (rowLimit < 0 || index < rowLimit)) { + while (currentRow != null && (rowLimit < 0 || target_table_index < rowLimit)) { if (currentRow.length != builders.length) { - // TODO [RW] do we throw or report as warnings and continue? - throw new InvalidRow(index, currentRow); + reportProblem( + new InvalidRow(source_row, keepInvalidRows ? target_table_index : null, currentRow)); + + if (keepInvalidRows) { + for (int i = 0; i < builders.length && i < currentRow.length; i++) { + String item = currentRow[i]; + builders[i] = builders[i].parseAndAppend(item); + } + + // If the current row had less columns than expected, nulls are inserted for the missing values. + // If it had more columns, the excess columns are discarded. + for (int i = currentRow.length; i < builders.length; i++) { + builders[i] = builders[i].parseAndAppend(null); + } + + target_table_index++; + } } else { for (int i = 0; i < builders.length; i++) { // TODO [RW] here we can plug-in some parsing logic, to be done as part of @@ -138,13 +197,14 @@ public Table read(InputStream inputStream) { String item = currentRow[i]; builders[i] = builders[i].parseAndAppend(item); } + + target_table_index++; } - currentRow = parser.parseNext(); - index++; + currentRow = nextRow(); } - // TODO [RW] check if this should be a try-finally + // FIXME [RW] check if this should be a try-finally parser.stopParsing(); Column[] columns = new Column[builders.length]; diff --git a/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java b/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java index 936396a999eb..cf678f475317 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java +++ b/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java @@ -1,11 +1,13 @@ package org.enso.table.read; -public class InvalidRow extends RuntimeException { - public final long index; +public class InvalidRow implements ParsingProblem { + public final long source_row; + public final Long table_index; public final String[] row; - public InvalidRow(long index, String[] row) { - this.index = index; + public InvalidRow(long source_row, Long table_index, String[] row) { + this.source_row = source_row; + this.table_index = table_index; this.row = row; } } diff --git a/std-bits/table/src/main/java/org/enso/table/read/ParsingFailedException.java b/std-bits/table/src/main/java/org/enso/table/read/ParsingFailedException.java new file mode 100644 index 000000000000..cae741f02537 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/read/ParsingFailedException.java @@ -0,0 +1,14 @@ +package org.enso.table.read; + +/** + * An exception thrown when a problem occured during parsing and the parser is running in a mode + * that does not try recovering, so the parsing is stopped. + */ +public class ParsingFailedException extends RuntimeException { + public final ParsingProblem problem; + + + public ParsingFailedException(ParsingProblem problem) { + this.problem = problem; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/read/ParsingProblem.java b/std-bits/table/src/main/java/org/enso/table/read/ParsingProblem.java new file mode 100644 index 000000000000..1587051dafc9 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/read/ParsingProblem.java @@ -0,0 +1,3 @@ +package org.enso.table.read; + +public interface ParsingProblem {} diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso index 0384b37d5961..a7f86471807b 100644 --- a/test/Table_Tests/src/Delimited_Read_Spec.enso +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -8,9 +8,9 @@ import Standard.Test import project.Util spec = - c_1 = ["a", [1, 4, 7, 10]] - c_2 = ["b", [2, Nothing, 8, 11]] - c_3 = ["c", [Nothing, 6, 9, 12]] + c_1 = ["a", ['1', '4', '7', '10']] + c_2 = ["b", ['2', Nothing, '8', '11']] + c_3 = ["c", [Nothing, '6', '9', '12']] expected_table = Table.new [c_1, c_2, c_3] Test.group "Delimited File Parsing" <| From 70a7a6da64ac30641640e305e749847a06f45cb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 28 Apr 2022 00:19:08 +0200 Subject: [PATCH 05/13] Add error handling tests, fix some line numbering edge cases --- .../Table/0.0.0-dev/src/Io/File_Format.enso | 21 +- .../org/enso/table/read/DelimitedReader.java | 6 +- test/Table_Tests/data/double_quoted.csv | 3 + test/Table_Tests/data/duplicated_columns.csv | 2 + test/Table_Tests/data/escape_quoted.csv | 3 + test/Table_Tests/data/no_quoting.csv | 2 + test/Table_Tests/data/varying_rows.csv | 7 + test/Table_Tests/src/Delimited_Read_Spec.enso | 200 ++++++------------ 8 files changed, 95 insertions(+), 149 deletions(-) create mode 100644 test/Table_Tests/data/double_quoted.csv create mode 100644 test/Table_Tests/data/duplicated_columns.csv create mode 100644 test/Table_Tests/data/escape_quoted.csv create mode 100644 test/Table_Tests/data/no_quoting.csv create mode 100644 test/Table_Tests/data/varying_rows.csv diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index efb2228188a1..ab2659804b78 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -4,9 +4,12 @@ import Standard.Table import Standard.Base.Error.Extensions as Errors from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding + +## TODO [RW] Not sure if shouldn't move the impl. details of Delimited to a separate file polyglot java import org.enso.table.read.DelimitedReader polyglot java import org.enso.table.read.ParsingFailedException polyglot java import org.enso.table.read.InvalidRow +polyglot java import java.lang.IllegalArgumentException ## This type needs to be here to allow for the usage of Standard.Table functions. Ideally, it would be an interface within Standard.Base and @@ -123,15 +126,15 @@ type Delimited translate_parsing_failure caught_panic = Error.throw (translate_problem caught_panic.payload.cause.problem) - Panic.catch Illegal_Argument_Error handler=translate_illegal_argument <| - Panic.catch ParsingFailedException handler=translate_parsing_failure - file.with_input_stream [File.Option.Read] stream-> - stream.with_java_stream java_stream-> - warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error - reader = DelimitedReader.new java_stream this.delimiter this.quote this.quote_escape java_headers skip_rows row_limit this.keep_invalid_rows warnings_as_errors - result = Table.Table reader.read - problems = Vector.Vector reader.getReportedProblems . map translate_problem - on_problems.attach_problems_after result problems + Panic.catch IllegalArgumentException handler=translate_illegal_argument <| + Panic.catch ParsingFailedException handler=translate_parsing_failure <| + file.with_input_stream [File.Option.Read] stream-> + stream.with_java_stream java_stream-> + warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error + reader = DelimitedReader.new java_stream this.delimiter this.quote this.quote_escape java_headers skip_rows row_limit this.keep_invalid_rows warnings_as_errors + result = Table.Table reader.read + problems = Vector.Vector reader.getReportedProblems . map translate_problem + on_problems.attach_problems_after result problems ## FIXME [RW] where to put these? type Infer diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index 77e58259b852..59562bf54fc6 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -129,11 +129,9 @@ private void reportProblem(ParsingProblem problem) { } } - private long source_row = 0; private long target_table_index = 0; private String[] nextRow() { - source_row++; return parser.parseNext(); } @@ -169,12 +167,12 @@ public Table read() { throw new IllegalStateException("Impossible branch."); } - StorageBuilder[] builders = initBuilders(currentRow.length); + StorageBuilder[] builders = initBuilders(headerNames.size()); while (currentRow != null && (rowLimit < 0 || target_table_index < rowLimit)) { if (currentRow.length != builders.length) { reportProblem( - new InvalidRow(source_row, keepInvalidRows ? target_table_index : null, currentRow)); + new InvalidRow(parser.getContext().currentLine(), keepInvalidRows ? target_table_index : null, currentRow)); if (keepInvalidRows) { for (int i = 0; i < builders.length && i < currentRow.length; i++) { diff --git a/test/Table_Tests/data/double_quoted.csv b/test/Table_Tests/data/double_quoted.csv new file mode 100644 index 000000000000..8cb16ff567cd --- /dev/null +++ b/test/Table_Tests/data/double_quoted.csv @@ -0,0 +1,3 @@ +a,b,c +"a, x",2,3 +"""a",4,"""" diff --git a/test/Table_Tests/data/duplicated_columns.csv b/test/Table_Tests/data/duplicated_columns.csv new file mode 100644 index 000000000000..92a121e6f65a --- /dev/null +++ b/test/Table_Tests/data/duplicated_columns.csv @@ -0,0 +1,2 @@ +a,b,c,a +1,2,3,4 diff --git a/test/Table_Tests/data/escape_quoted.csv b/test/Table_Tests/data/escape_quoted.csv new file mode 100644 index 000000000000..3879fe589359 --- /dev/null +++ b/test/Table_Tests/data/escape_quoted.csv @@ -0,0 +1,3 @@ +a,b,c +"a\"b",2,3 +"a\\\"z",4,5 diff --git a/test/Table_Tests/data/no_quoting.csv b/test/Table_Tests/data/no_quoting.csv new file mode 100644 index 000000000000..00b7d2322423 --- /dev/null +++ b/test/Table_Tests/data/no_quoting.csv @@ -0,0 +1,2 @@ +a,b,c +"y,z",a diff --git a/test/Table_Tests/data/varying_rows.csv b/test/Table_Tests/data/varying_rows.csv new file mode 100644 index 000000000000..ca39455cee12 --- /dev/null +++ b/test/Table_Tests/data/varying_rows.csv @@ -0,0 +1,7 @@ +a,b,c +1,2,3,4 +1,2,3 +1,2 + +1 +1,2,3,4,5,6,7,8 diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso index a7f86471807b..35d58a0f448f 100644 --- a/test/Table_Tests/src/Delimited_Read_Spec.enso +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -2,150 +2,78 @@ from Standard.Base import all import Standard.Table import Standard.Table.Data.Column -import Standard.Table.Io.File_Format +from Standard.Table.Io.File_Format import Invalid_Row import Standard.Base.Error.Problem_Behavior import Standard.Test +import Standard.Test.Problems import project.Util spec = - c_1 = ["a", ['1', '4', '7', '10']] - c_2 = ["b", ['2', Nothing, '8', '11']] - c_3 = ["c", [Nothing, '6', '9', '12']] - expected_table = Table.new [c_1, c_2, c_3] - Test.group "Delimited File Parsing" <| - Test.specify "should parse a simple numeric table" <| + Test.specify "should load a simple table with headers" <| + c_1 = ["a", ['1', '4', '7', '10']] + c_2 = ["b", ['2', Nothing, '8', '11']] + c_3 = ["c", [Nothing, '6', '9', '12']] + expected_table = Table.new [c_1, c_2, c_3] simple_empty = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error simple_empty.should_equal expected_table -# Test.specify "should handle duplicated columns" <| -# csv = """ -# name,x,y,x,y -# foo,10,20,30,20 -# t = Table.from_csv csv -# t.columns.map .name . should_equal ['name', 'x', 'y', 'x 1', 'y 1'] -# -# Test.group 'Writing' <| -# Test.specify 'should properly serialize simple tables' <| -# varied_column = (Enso_Project.data / "varied_column.csv") . read_csv has_header=False -# res = varied_column.to_csv -# exp = ''' -# C0,C1,C2,C3,C4,C5 -# 2005-02-25,2005-02-25,1,1,1.0,1 -# 2005-02-28,2005-02-28,2,2,2.0,2 -# 4,2005-03-01,3,3,3.0,3 -# 2005-03-02,,4,4,4.0,4 -# ,2005-03-03,5,5,5.0,5 -# 2005-03-04,2005-03-04,,6,6.25,6.25 -# 2005-03-07,2005-03-07,7,7,7.0,7 -# 2005-03-08,2005-03-08,8,8,8.0,osiem\n -# res.should_equal exp -# -# Test.specify 'should properly handle quoting of records and allow specifying separators and newlines' <| -# c1 = ['name', ['Robert");DROP TABLE Students;--', 'This;Name;;Is""Strange', 'Marcin,,']] -# c2 = ['grade', [10, 20, 'hello;world']] -# t = Table.new [c1, c2] -# -# expected_wrong_newline = """ -# name;grade -# "Robert"");DROP TABLE Students;--";10 -# "This;Name;;Is""""Strange";20 -# Marcin,,;"hello;world" -# -# expected = expected_wrong_newline.lines . join '\r\n' -# res = t.to_csv separator=';' line_ending=Line_Ending_Style.Windows -# res.should_equal expected+'\r\n' -# -# Test.specify 'should allow forced quoting of records' -# c1 = ['name', ['Robert");DROP TABLE Students;--', 'This;Name;;Is""Strange', 'Marcin,,']] -# c2 = ['grade', [10, 20, 'hello;world']] -# t = Table.new [c1, c2] -# -# expected = """ -# "name","grade" -# "Robert"");DROP TABLE Students;--","10" -# "This;Name;;Is""""Strange","20" -# "Marcin,,","hello;world" -# -# res = t.to_csv always_quote=True -# res.should_equal expected+'\n' -# -# -# Test.specify 'should write CSV to a file' <| -# varied_column = (Enso_Project.data / "varied_column.csv") . read_csv has_header=False -# out = Enso_Project.data / 'out.csv' -# out.delete_if_exists -# varied_column.write_csv out -# exp = ''' -# C0,C1,C2,C3,C4,C5 -# 2005-02-25,2005-02-25,1,1,1.0,1 -# 2005-02-28,2005-02-28,2,2,2.0,2 -# 4,2005-03-01,3,3,3.0,3 -# 2005-03-02,,4,4,4.0,4 -# ,2005-03-03,5,5,5.0,5 -# 2005-03-04,2005-03-04,,6,6.25,6.25 -# 2005-03-07,2005-03-07,7,7,7.0,7 -# 2005-03-08,2005-03-08,8,8,8.0,osiem\n -# out.read_text.should_equal exp -# out.delete_if_exists -# -# Test.specify 'should write CSV to multiple files, when row limit specified' <| -# varied_column = (Enso_Project.data / "varied_column.csv") . read_csv has_header=False -# out = Enso_Project.data / 'out.csv' -# out_1 = Enso_Project.data / 'out_1.csv' -# out_2 = Enso_Project.data / 'out_2.csv' -# out_3 = Enso_Project.data / 'out_3.csv' -# out_1.delete_if_exists -# out_2.delete_if_exists -# out_3.delete_if_exists -# varied_column.write_csv out max_rows_per_file=3 -# exp_1 = ''' -# C0,C1,C2,C3,C4,C5 -# 2005-02-25,2005-02-25,1,1,1.0,1 -# 2005-02-28,2005-02-28,2,2,2.0,2 -# 4,2005-03-01,3,3,3.0,3\n -# exp_2 = ''' -# C0,C1,C2,C3,C4,C5 -# 2005-03-02,,4,4,4.0,4 -# ,2005-03-03,5,5,5.0,5 -# 2005-03-04,2005-03-04,,6,6.25,6.25\n -# exp_3 = ''' -# C0,C1,C2,C3,C4,C5 -# 2005-03-07,2005-03-07,7,7,7.0,7 -# 2005-03-08,2005-03-08,8,8,8.0,osiem\n -# out_1.read_text.should_equal exp_1 -# out_2.read_text.should_equal exp_2 -# out_3.read_text.should_equal exp_3 -# out_1.delete_if_exists -# out_2.delete_if_exists -# out_3.delete_if_exists -# -# Test.specify 'should be possible through the write method' <| -# varied_column = (Enso_Project.data / "varied_column.csv") . read_csv has_header=False -# out = Enso_Project.data / 'out.csv' -# out_1 = Enso_Project.data / 'out_1.csv' -# out_2 = Enso_Project.data / 'out_2.csv' -# out_3 = Enso_Project.data / 'out_3.csv' -# out_1.delete_if_exists -# out_2.delete_if_exists -# out_3.delete_if_exists -# varied_column.write out (Table.Format.Csv include_header=False separator=';' max_rows_per_file=3) -# exp_1 = ''' -# 2005-02-25;2005-02-25;1;1;1.0;1 -# 2005-02-28;2005-02-28;2;2;2.0;2 -# 4;2005-03-01;3;3;3.0;3\n -# exp_2 = ''' -# 2005-03-02;;4;4;4.0;4 -# ;2005-03-03;5;5;5.0;5 -# 2005-03-04;2005-03-04;;6;6.25;6.25\n -# exp_3 = ''' -# 2005-03-07;2005-03-07;7;7;7.0;7 -# 2005-03-08;2005-03-08;8;8;8.0;osiem\n -# out_1.read_text.should_equal exp_1 -# out_2.read_text.should_equal exp_2 -# out_3.read_text.should_equal exp_3 -# out_1.delete_if_exists -# out_2.delete_if_exists -# out_3.delete_if_exists + Test.specify "should load a simple table without headers" <| + c_1 = ["Column_1", ['a', '1', '4', '7', '10']] + c_2 = ["Column_2", ['b', '2', Nothing, '8', '11']] + c_3 = ["Column_3", ['c', Nothing, '6', '9', '12']] + expected_table = Table.new [c_1, c_2, c_3] + simple_empty = (File_Format.Delimited "," headers=False).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + simple_empty.should_equal expected_table + + Test.specify "should handle duplicated columns" <| + table = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "duplicated_columns.csv") Problem_Behavior.Report_Error + table.columns.map .name . should_equal ['a', 'b', 'c', 'a 1'] + table.at 'a' . to_vector . should_equal ['1'] + table.at 'a 1' . to_vector . should_equal ['4'] + + Test.specify "should handle quotes" <| + t1 = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "double_quoted.csv") Problem_Behavior.Report_Error + # TODO shouldn't t1 be '"a,x"'? how to deduce when to keep original quotes? + t1.at 'a' . to_vector . should_equal ['a, x', '"a'] + t1.at 'c' . to_vector . should_equal ['3', '"'] + + t2 = (File_Format.Delimited "," headers=True quote_escape="\").read (Enso_Project.data / "escape_quoted.csv") Problem_Behavior.Report_Error + t2.at 'a' . to_vector . should_equal ['a"b', 'a\\\"z'] + + t3 = (File_Format.Delimited "," quote=Nothing headers=True).read (Enso_Project.data / "no_quoting.csv") Problem_Behavior.Report_Error + t3.at 'a' . to_vector . should_equal ['"y'] + t3.at 'b' . to_vector . should_equal ['z"'] + t3.at 'c' . to_vector . should_equal ['a'] + + Test.specify "should handle too long and too short rows" <| + action keep_invalid_rows on_problems = + (File_Format.Delimited "," headers=True keep_invalid_rows=keep_invalid_rows).read (Enso_Project.data / "varying_rows.csv") on_problems + + tester_kept table = + table.columns.map .name . should_equal ['a', 'b', 'c'] + # FIXME [RW] A completely empty line is ignored instead of being treated as empty row. OK? + table.at 'a' . to_vector . should_equal ['1', '1', '1', '1', '1'] + table.at 'b' . to_vector . should_equal ['2', '2', '2', Nothing, '2'] + table.at 'c' . to_vector . should_equal ['3', '3', Nothing, Nothing, '3'] + problems_kept = [Invalid_Row 2 0 ['1', '2', '3', '4'], Invalid_Row 4 2 ['1', '2'], Invalid_Row 6 3 ['1'], Invalid_Row 7 4 ['1', '2', '3', '4', '5', '6', '7', '8']] + Problems.test_problem_handling (action keep_invalid_rows=True) problems_kept tester_kept + + tester_dropped table = + table.columns.map .name . should_equal ['a', 'b', 'c'] + table.at 'a' . to_vector . should_equal ['1'] + table.at 'b' . to_vector . should_equal ['2'] + table.at 'c' . to_vector . should_equal ['3'] + problems_dropped = [Invalid_Row 2 Nothing ['1', '2', '3', '4'], Invalid_Row 4 Nothing ['1', '2'], Invalid_Row 6 Nothing ['1'], Invalid_Row 7 Nothing ['1', '2', '3', '4', '5', '6', '7', '8']] + Problems.test_problem_handling (action keep_invalid_rows=False) problems_dropped tester_dropped + + Test.specify "should allow to skip rows" <| + Nothing + + Test.specify "should allow to set a limit of rows to read" <| + Nothing + + Test.specify "should check arguments" <| + Nothing main = Test.Suite.run_main here.spec From af2e1f9310d348a2abcdf89ecec9e6f80e74e49b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 28 Apr 2022 12:52:13 +0200 Subject: [PATCH 06/13] adding some tests --- .../org/enso/table/read/DelimitedReader.java | 7 ++++- test/Table_Tests/data/double_quoted.csv | 2 +- test/Table_Tests/data/missing_header.csv | 2 ++ test/Table_Tests/data/simple_empty.csv | 4 +-- test/Table_Tests/src/Delimited_Read_Spec.enso | 26 ++++++++++++++----- 5 files changed, 30 insertions(+), 11 deletions(-) create mode 100644 test/Table_Tests/data/missing_header.csv diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index 59562bf54fc6..d4ebd3d53e8e 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -7,6 +7,8 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; import org.enso.table.data.column.builder.string.StorageBuilder; import org.enso.table.data.column.builder.string.StringStorageBuilder; import org.enso.table.data.column.storage.Storage; @@ -112,6 +114,8 @@ private CsvParser setupCsvParser(InputStream inputStream) { format.setQuoteEscape(quoteEscapeCharacter); settings.setFormat(format); settings.setMaxCharsPerColumn(-1); + settings.setSkipEmptyLines(false); + settings.setKeepQuotes(false); CsvParser parser = new CsvParser(settings); parser.beginParsing(inputStream); return parser; @@ -153,7 +157,8 @@ public Table read() { case INFER: throw new IllegalStateException("Inferring headers is not yet implemented"); case USE_FIRST_ROW_AS_HEADERS: - headerNames = NameDeduplicator.deduplicate(Arrays.asList(currentRow)); + List preprocessedHeaders = Arrays.stream(currentRow).map(s -> s == null ? "Column" : s).collect(Collectors.toList()); + headerNames = NameDeduplicator.deduplicate(preprocessedHeaders, "_"); // We have 'used up' the first row, so we load a next one. currentRow = nextRow(); break; diff --git a/test/Table_Tests/data/double_quoted.csv b/test/Table_Tests/data/double_quoted.csv index 8cb16ff567cd..378b996a22c2 100644 --- a/test/Table_Tests/data/double_quoted.csv +++ b/test/Table_Tests/data/double_quoted.csv @@ -1,3 +1,3 @@ -a,b,c +a,"b",c "a, x",2,3 """a",4,"""" diff --git a/test/Table_Tests/data/missing_header.csv b/test/Table_Tests/data/missing_header.csv new file mode 100644 index 000000000000..5908fa6adc43 --- /dev/null +++ b/test/Table_Tests/data/missing_header.csv @@ -0,0 +1,2 @@ +a,,c,,d +1,2,3,4,5 diff --git a/test/Table_Tests/data/simple_empty.csv b/test/Table_Tests/data/simple_empty.csv index f93618d5d4de..6f6c21b69831 100644 --- a/test/Table_Tests/data/simple_empty.csv +++ b/test/Table_Tests/data/simple_empty.csv @@ -1,5 +1,5 @@ -a,b,c +a,b,"c" 1,2, -4,,6 +"4",,6 7,8,9 10,11,12 diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso index 35d58a0f448f..001eef10d505 100644 --- a/test/Table_Tests/src/Delimited_Read_Spec.enso +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -26,11 +26,20 @@ spec = simple_empty = (File_Format.Delimited "," headers=False).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error simple_empty.should_equal expected_table + Test.specify "should work in presence of missing headers" <| + table = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "missing_header.csv") Problem_Behavior.Report_Error + table.columns.map .name . should_equal ["a", "Column", "c", "Column_1", "d"] + table.at "a" . to_vector . should_equal ["1"] + table.at "Column" . to_vector . should_equal ["2"] + table.at "c" . to_vector . should_equal ["3"] + table.at "Column_1" . to_vector . should_equal ["4"] + table.at "d" . to_vector . should_equal ["5"] + Test.specify "should handle duplicated columns" <| table = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "duplicated_columns.csv") Problem_Behavior.Report_Error - table.columns.map .name . should_equal ['a', 'b', 'c', 'a 1'] + table.columns.map .name . should_equal ['a', 'b', 'c', 'a_1'] table.at 'a' . to_vector . should_equal ['1'] - table.at 'a 1' . to_vector . should_equal ['4'] + table.at 'a_1' . to_vector . should_equal ['4'] Test.specify "should handle quotes" <| t1 = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "double_quoted.csv") Problem_Behavior.Report_Error @@ -46,6 +55,9 @@ spec = t3.at 'b' . to_vector . should_equal ['z"'] t3.at 'c' . to_vector . should_equal ['a'] + Test.specify "should support rows spanning multiple lines if quoted" <| + Nothing + Test.specify "should handle too long and too short rows" <| action keep_invalid_rows on_problems = (File_Format.Delimited "," headers=True keep_invalid_rows=keep_invalid_rows).read (Enso_Project.data / "varying_rows.csv") on_problems @@ -53,10 +65,10 @@ spec = tester_kept table = table.columns.map .name . should_equal ['a', 'b', 'c'] # FIXME [RW] A completely empty line is ignored instead of being treated as empty row. OK? - table.at 'a' . to_vector . should_equal ['1', '1', '1', '1', '1'] - table.at 'b' . to_vector . should_equal ['2', '2', '2', Nothing, '2'] - table.at 'c' . to_vector . should_equal ['3', '3', Nothing, Nothing, '3'] - problems_kept = [Invalid_Row 2 0 ['1', '2', '3', '4'], Invalid_Row 4 2 ['1', '2'], Invalid_Row 6 3 ['1'], Invalid_Row 7 4 ['1', '2', '3', '4', '5', '6', '7', '8']] + table.at 'a' . to_vector . should_equal ['1', '1', '1', Nothing, '1', '1'] + table.at 'b' . to_vector . should_equal ['2', '2', '2', Nothing, Nothing, '2'] + table.at 'c' . to_vector . should_equal ['3', '3', Nothing, Nothing, Nothing, '3'] + problems_kept = [Invalid_Row 2 0 ['1', '2', '3', '4'], Invalid_Row 4 2 ['1', '2'], Invalid_Row 5 3 [Nothing], Invalid_Row 6 4 ['1'], Invalid_Row 7 5 ['1', '2', '3', '4', '5', '6', '7', '8']] Problems.test_problem_handling (action keep_invalid_rows=True) problems_kept tester_kept tester_dropped table = @@ -64,7 +76,7 @@ spec = table.at 'a' . to_vector . should_equal ['1'] table.at 'b' . to_vector . should_equal ['2'] table.at 'c' . to_vector . should_equal ['3'] - problems_dropped = [Invalid_Row 2 Nothing ['1', '2', '3', '4'], Invalid_Row 4 Nothing ['1', '2'], Invalid_Row 6 Nothing ['1'], Invalid_Row 7 Nothing ['1', '2', '3', '4', '5', '6', '7', '8']] + problems_dropped = [Invalid_Row 2 Nothing ['1', '2', '3', '4'], Invalid_Row 4 Nothing ['1', '2'], Invalid_Row 5 Nothing [Nothing], Invalid_Row 6 Nothing ['1'], Invalid_Row 7 Nothing ['1', '2', '3', '4', '5', '6', '7', '8']] Problems.test_problem_handling (action keep_invalid_rows=False) problems_dropped tester_dropped Test.specify "should allow to skip rows" <| From 6ca34a344149448334a154fa0325d4c90152c282 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 28 Apr 2022 17:40:54 +0200 Subject: [PATCH 07/13] Refactor, add tests for edge cases and IO error handling, fix stuff --- .../Base/0.0.0-dev/src/System/File.enso | 14 +++- .../Standard/Table/0.0.0-dev/src/Error.enso | 17 +++++ .../0.0.0-dev/src/Io/Delimited_Reader.enso | 76 +++++++++++++++++++ .../Table/0.0.0-dev/src/Io/File_Format.enso | 47 ++---------- .../lib/Standard/Test/0.0.0-dev/src/Main.enso | 2 +- .../org/enso/table/read/DelimitedReader.java | 7 +- test/Table_Tests/data/empty.txt | 0 test/Table_Tests/data/many_invalid_rows.csv | 16 ++++ test/Table_Tests/data/mismatched_quote.csv | 4 + test/Table_Tests/data/multiline_quoted.csv | 5 ++ test/Table_Tests/src/Delimited_Read_Spec.enso | 58 ++++++++++++-- 11 files changed, 190 insertions(+), 56 deletions(-) create mode 100644 distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso create mode 100644 test/Table_Tests/data/empty.txt create mode 100644 test/Table_Tests/data/many_invalid_rows.csv create mode 100644 test/Table_Tests/data/mismatched_quote.csv create mode 100644 test/Table_Tests/data/multiline_quoted.csv diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso index d99646b171c9..1b64f6d80ead 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso @@ -882,10 +882,16 @@ type Input_Stream Utility method for running an action with Java exceptions mapping. handle_java_exceptions file ~action = - Panic.catch IOException handler=(caught_panic-> (Error.throw (Io_Error file "An IO error has occurred: " + caught_panic.payload.cause.getMessage))) <| - Panic.catch AccessDeniedException handler=(_-> (Error.throw (Io_Error file "You do not have permission to access the file"))) <| - Panic.catch NoSuchFileException handler=(_-> (Error.throw (File_Not_Found file))) <| - action + Panic.catch IOException action caught_panic-> + here.wrap_io_exception file caught_panic.payload.cause + +## PRIVATE + + Converts a Java `IOException` into its Enso counterpart. +wrap_io_exception file io_exception = + if Java.is_instance io_exception NoSuchFileException then Error.throw (File_Not_Found file) else + if Java.is_instance io_exception AccessDeniedException then Error.throw (Io_Error file "You do not have permission to access the file") else + Error.throw (Io_Error file "An IO error has occurred: "+io_exception.getMessage) ## PRIVATE diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso index 7a49ede59fb8..f2f7522ce89f 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso @@ -94,3 +94,20 @@ type Additional_Warnings (count:Integer) Additional_Warnings.to_display_text : Text Additional_Warnings.to_display_text = "There were "+this.count.to_text+" additional issues." + +## Indicates that when loading a delimited file, a row was encountered which had + too many or too few columns. + + Only the first 10 rows are reported, any additional ones are aggregated into + a single instance of `Additional_Invalid_Rows`. +type Invalid_Row (source_file_row : Integer) (index : Integer | Nothing) (row : [Text]) + +## Indicates how many additional `Invalid_Row` warnings have been suppressed. +type Additional_Invalid_Rows (count : Integer) + +## Indicates that a quote inside of a delimited file cell has been opened but + never closed. +type Mismatched_Quote + +## Indicates an unexpected parser error. +type Parser_Error cause diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso new file mode 100644 index 000000000000..e53a1ee048a6 --- /dev/null +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso @@ -0,0 +1,76 @@ +from Standard.Base import all +import Standard.Table + +import Standard.Base.Error.Extensions as Errors +from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior +from Standard.Table.Error as Table_Errors import Invalid_Row, Parser_Error +from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding +from Standard.Table.Io.File_Format import Infer + +polyglot java import org.enso.table.read.DelimitedReader +polyglot java import org.enso.table.read.ParsingFailedException +polyglot java import org.enso.table.read.InvalidRow +polyglot java import java.lang.IllegalArgumentException +polyglot java import java.io.IOException +polyglot java import com.univocity.parsers.common.TextParsingException +polyglot java import java.io.InputStream + +## Reads a file according to the provided format. +read_file : Delimited -> File -> Problem_Behavior -> Any +read_file format file on_problems = + if format.encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else + ## We use the default `max_columns` setting. If we want to be able to + read files with unlimited column limits (risking OutOfMemory + exceptions), we can catch the exception indicating the limit has been + reached and restart parsing with an increased limit. + file.with_input_stream [File.Option.Read] stream-> + stream.with_java_stream java_stream-> + here.read_stream format java_stream on_problems related_file=file + +## PRIVATE + Reads an input stream according to the provided format. + + The `encoding` parameter is ignored, the provided stream should handle any + necessary decoding. + + Arguments: + - format: + - java_stream: + - on_problems: + - max_columns: + - related_file: +read_stream : Delimited -> InputStream -> Problem_Behavior -> File | Nothing -> Any +read_stream format java_stream on_problems max_columns=4096 related_file=Nothing = + java_headers = case format.headers of + True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS + Infer -> Errors.unimplemented "Inferring headers is not implemented yet." + False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS + skip_rows = case format.skip_rows of + Nothing -> 0 + Integer -> format.skip_rows + _ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.") + row_limit = case format.row_limit of + Nothing -> -1 + Integer -> format.row_limit + _ -> Error.throw (Illegal_Argument_Error "`row_limit` should be Integer or Nothing.") + if format.parse_values then Errors.unimplemented "Parsing values is not implemented yet." else + translate_illegal_argument caught_panic = + Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) + translate_problem java_problem = + if Java.is_instance java_problem InvalidRow then Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row) else + java_problem + translate_parsing_failure caught_panic = + Error.throw (translate_problem caught_panic.payload.cause.problem) + translate_parsing_exception caught_panic = + cause = caught_panic.payload.cause.getCause + if Java.is_instance cause IOException then File.wrap_io_exception related_file cause else + Error.throw (Parser_Error caught_panic.payload) + + Panic.catch IllegalArgumentException handler=translate_illegal_argument <| + Panic.catch ParsingFailedException handler=translate_parsing_failure <| + Panic.catch TextParsingException handler=translate_parsing_exception <| + warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error + reader = DelimitedReader.new java_stream format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns format.keep_invalid_rows warnings_as_errors + result = Table.Table reader.read + problems = Vector.Vector reader.getReportedProblems . map translate_problem + on_problems.attach_problems_after result problems diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index ab2659804b78..1ff81ab1ce95 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -4,12 +4,7 @@ import Standard.Table import Standard.Base.Error.Extensions as Errors from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding - -## TODO [RW] Not sure if shouldn't move the impl. details of Delimited to a separate file -polyglot java import org.enso.table.read.DelimitedReader -polyglot java import org.enso.table.read.ParsingFailedException -polyglot java import org.enso.table.read.InvalidRow -polyglot java import java.lang.IllegalArgumentException +import Standard.Table.Io.Delimited_Reader ## This type needs to be here to allow for the usage of Standard.Table functions. Ideally, it would be an interface within Standard.Base and @@ -103,40 +98,8 @@ type Delimited ## Implements the `File.read` for this `File_Format` read : File -> Problem_Behavior -> Any read file on_problems = - java_headers = case this.headers of - True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS - Infer -> Errors.unimplemented "Inferring headers is not implemented yet." - False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS - skip_rows = case this.skip_rows of - Nothing -> 0 - Integer -> skip_rows - _ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.") - row_limit = case this.row_limit of - Nothing -> -1 - Integer -> row_limit - _ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.") - if this.parse_values then Errors.unimplemented "Parsing values is not implemented yet." else - if this.encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else - ## TODO test and handle errors like file not found or access denied etc. - translate_illegal_argument caught_panic = - Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) - translate_problem java_problem = - if Java.is_instance java_problem InvalidRow then Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row) else - java_problem - translate_parsing_failure caught_panic = - Error.throw (translate_problem caught_panic.payload.cause.problem) - - Panic.catch IllegalArgumentException handler=translate_illegal_argument <| - Panic.catch ParsingFailedException handler=translate_parsing_failure <| - file.with_input_stream [File.Option.Read] stream-> - stream.with_java_stream java_stream-> - warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error - reader = DelimitedReader.new java_stream this.delimiter this.quote this.quote_escape java_headers skip_rows row_limit this.keep_invalid_rows warnings_as_errors - result = Table.Table reader.read - problems = Vector.Vector reader.getReportedProblems . map translate_problem - on_problems.attach_problems_after result problems - -## FIXME [RW] where to put these? + Delimited_Reader.read_file this file on_problems + +## A setting to infer the default behaviour of some option. type Infer -type Invalid_Row (source_file_row : Integer) (index : Integer | Nothing) (row : [Text]) -type Mismatched_Quote + diff --git a/distribution/lib/Standard/Test/0.0.0-dev/src/Main.enso b/distribution/lib/Standard/Test/0.0.0-dev/src/Main.enso index 8447ea562838..7ae6d101e28d 100644 --- a/distribution/lib/Standard/Test/0.0.0-dev/src/Main.enso +++ b/distribution/lib/Standard/Test/0.0.0-dev/src/Main.enso @@ -165,7 +165,7 @@ fail message = Any.should_fail_with : Any -> Integer -> Assertion Any.should_fail_with matcher frames_to_skip=0 = loc = Meta.get_source_location 1+frames_to_skip - here.fail ("Expected an error " + matcher.to_text + " but none occurred (at " + loc + ").") + here.fail ("Expected an error " + matcher.to_text + " but no error occurred, instead got: " + this.to_text + " (at " + loc + ").") ## Expect a function to fail with the provided dataflow error. diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index d4ebd3d53e8e..5d158a6f11a7 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -8,7 +8,6 @@ import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -import java.util.stream.Stream; import org.enso.table.data.column.builder.string.StorageBuilder; import org.enso.table.data.column.builder.string.StringStorageBuilder; import org.enso.table.data.column.storage.Storage; @@ -30,7 +29,7 @@ public enum HeaderBehavior { private final HeaderBehavior headerBehavior; private final long skipRows; private final long rowLimit; - private final InputStream inputStream; + private final int maxColumns; private final List warnings = new ArrayList<>(); private final CsvParser parser; private final boolean keepInvalidRows; @@ -50,6 +49,7 @@ public DelimitedReader( HeaderBehavior headerBehavior, long skipRows, long rowLimit, + int maxColumns, boolean keepInvalidRows, boolean warningsAsErrors) { if (delimiter.isEmpty()) { @@ -98,9 +98,9 @@ public DelimitedReader( this.headerBehavior = headerBehavior; this.skipRows = skipRows; this.rowLimit = rowLimit; + this.maxColumns = maxColumns; this.keepInvalidRows = keepInvalidRows; this.warningsAsErrors = warningsAsErrors; - this.inputStream = inputStream; parser = setupCsvParser(inputStream); } @@ -114,6 +114,7 @@ private CsvParser setupCsvParser(InputStream inputStream) { format.setQuoteEscape(quoteEscapeCharacter); settings.setFormat(format); settings.setMaxCharsPerColumn(-1); + settings.setMaxColumns(maxColumns); settings.setSkipEmptyLines(false); settings.setKeepQuotes(false); CsvParser parser = new CsvParser(settings); diff --git a/test/Table_Tests/data/empty.txt b/test/Table_Tests/data/empty.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/Table_Tests/data/many_invalid_rows.csv b/test/Table_Tests/data/many_invalid_rows.csv new file mode 100644 index 000000000000..fa7e378fea03 --- /dev/null +++ b/test/Table_Tests/data/many_invalid_rows.csv @@ -0,0 +1,16 @@ +a,b,c +0,x,y +1 +2 +3 +4 +5,u,v +6 +7 +8 +9 +10 +11 +12 +13 +14 diff --git a/test/Table_Tests/data/mismatched_quote.csv b/test/Table_Tests/data/mismatched_quote.csv new file mode 100644 index 000000000000..931de64013f6 --- /dev/null +++ b/test/Table_Tests/data/mismatched_quote.csv @@ -0,0 +1,4 @@ +a,b,c +1,2,3 + abc,"d e f,ghi +7,8,9 diff --git a/test/Table_Tests/data/multiline_quoted.csv b/test/Table_Tests/data/multiline_quoted.csv new file mode 100644 index 000000000000..41924ca1e85b --- /dev/null +++ b/test/Table_Tests/data/multiline_quoted.csv @@ -0,0 +1,5 @@ +a,b,c +1,"start + +continue",3 +4,5,6 diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso index 001eef10d505..c685a2b0a147 100644 --- a/test/Table_Tests/src/Delimited_Read_Spec.enso +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -2,7 +2,8 @@ from Standard.Base import all import Standard.Table import Standard.Table.Data.Column -from Standard.Table.Io.File_Format import Invalid_Row +from Standard.Table.Error import Invalid_Row +import Standard.Table.Io.File_Format import Standard.Base.Error.Problem_Behavior import Standard.Test import Standard.Test.Problems @@ -35,6 +36,20 @@ spec = table.at "Column_1" . to_vector . should_equal ["4"] table.at "d" . to_vector . should_equal ["5"] + Test.specify "load even an empty file" <| + table = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "empty.txt") Problem_Behavior.Report_Error + table.columns.map .name . should_equal [] + table.row_count . should_equal 0 + + Test.specify "should correctly handle file opening issues" <| + nonexistent_file = Enso_Project.data / "a_filename_that_does_not_exist.foobar" + r1 = (File_Format.Delimited "," headers=True).read nonexistent_file Problem_Behavior.Report_Error + r1.should_fail_with File.File_Not_Found + + directory = Enso_Project.data + r2 = (File_Format.Delimited "," headers=True).read directory Problem_Behavior.Report_Error + r2.should_fail_with File.Io_Error + Test.specify "should handle duplicated columns" <| table = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "duplicated_columns.csv") Problem_Behavior.Report_Error table.columns.map .name . should_equal ['a', 'b', 'c', 'a_1'] @@ -43,7 +58,6 @@ spec = Test.specify "should handle quotes" <| t1 = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "double_quoted.csv") Problem_Behavior.Report_Error - # TODO shouldn't t1 be '"a,x"'? how to deduce when to keep original quotes? t1.at 'a' . to_vector . should_equal ['a, x', '"a'] t1.at 'c' . to_vector . should_equal ['3', '"'] @@ -56,7 +70,10 @@ spec = t3.at 'c' . to_vector . should_equal ['a'] Test.specify "should support rows spanning multiple lines if quoted" <| - Nothing + t1 = (File_Format.Delimited "," headers=True).read (Enso_Project.data / "multiline_quoted.csv") Problem_Behavior.Report_Error + t1.at 'a' . to_vector . should_equal ['1', '4'] + t1.at 'b' . to_vector . should_equal ['start\n\ncontinue', '5'] + t1.at 'c' . to_vector . should_equal ['3', '6'] Test.specify "should handle too long and too short rows" <| action keep_invalid_rows on_problems = @@ -80,12 +97,41 @@ spec = Problems.test_problem_handling (action keep_invalid_rows=False) problems_dropped tester_dropped Test.specify "should allow to skip rows" <| - Nothing + t1 = (File_Format.Delimited "," headers=False skip_rows=3).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t1.at "Column_1" . to_vector . should_equal ['7', '10'] + + t2 = (File_Format.Delimited "," headers=True skip_rows=3).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t2.columns.map .name . should_equal ['7', '8', '9'] + t2.at "7" . to_vector . should_equal ['10'] Test.specify "should allow to set a limit of rows to read" <| - Nothing + t1 = (File_Format.Delimited "," headers=False row_limit=2).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t1.at "Column_1" . to_vector . should_equal ['a', '1'] + + t2 = (File_Format.Delimited "," headers=True row_limit=2).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t2.at "a" . to_vector . should_equal ['1', '4'] + + t3 = (File_Format.Delimited "," headers=False skip_rows=3 row_limit=1).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t3.at "Column_1" . to_vector . should_equal ['7'] + + t4 = (File_Format.Delimited "," headers=False row_limit=0).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t4.columns.map .name . should_equal ['Column_1', 'Column_2', 'Column_3'] + t4.row_count . should_equal 0 + + t5 = (File_Format.Delimited "," headers=True row_limit=0).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t5.columns.map .name . should_equal ['a', 'b', 'c'] + t5.at 'a' . to_vector . should_equal [] + t5.row_count . should_equal 0 + + t6 = (File_Format.Delimited "," headers=False skip_rows=3 row_limit=1000).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error + t6.at "Column_1" . to_vector . should_equal ['7', '10'] Test.specify "should check arguments" <| - Nothing + path = (Enso_Project.data / "simple_empty.csv") + pb = Problem_Behavior.Report_Error + (File_Format.Delimited "," headers=False quote='abc').read path pb . should_fail_with Illegal_Argument_Error + (File_Format.Delimited "," headers=False quote='🚧').read path pb . should_fail_with Illegal_Argument_Error + (File_Format.Delimited "," headers=False quote_escape='//').read path pb . should_fail_with Illegal_Argument_Error + (File_Format.Delimited 'a\u{301}' headers=False).read path pb . should_fail_with Illegal_Argument_Error main = Test.Suite.run_main here.spec From 7bf1faf94d2cc60675fc2a6c944105dcba3cfa61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 28 Apr 2022 18:26:53 +0200 Subject: [PATCH 08/13] Handle mismatched quotes --- .../Standard/Table/0.0.0-dev/src/Error.enso | 2 +- .../0.0.0-dev/src/Io/Delimited_Reader.enso | 6 +- .../org/enso/table/read/DelimitedReader.java | 67 ++++++++++++++++--- .../org/enso/table/read/MismatchedQuote.java | 3 + test/Table_Tests/data/mismatched_quote.csv | 2 +- test/Table_Tests/data/mismatched_quote2.csv | 4 ++ test/Table_Tests/src/Delimited_Read_Spec.enso | 26 ++++++- 7 files changed, 95 insertions(+), 15 deletions(-) create mode 100644 std-bits/table/src/main/java/org/enso/table/read/MismatchedQuote.java create mode 100644 test/Table_Tests/data/mismatched_quote2.csv diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso index f2f7522ce89f..3bac2ddeaee1 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Error.enso @@ -100,7 +100,7 @@ Additional_Warnings.to_display_text = Only the first 10 rows are reported, any additional ones are aggregated into a single instance of `Additional_Invalid_Rows`. -type Invalid_Row (source_file_row : Integer) (index : Integer | Nothing) (row : [Text]) +type Invalid_Row (source_file_line_number : Integer) (index : Integer | Nothing) (row : [Text]) ## Indicates how many additional `Invalid_Row` warnings have been suppressed. type Additional_Invalid_Rows (count : Integer) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso index e53a1ee048a6..baf1a01d9492 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso @@ -3,13 +3,14 @@ import Standard.Table import Standard.Base.Error.Extensions as Errors from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior -from Standard.Table.Error as Table_Errors import Invalid_Row, Parser_Error +from Standard.Table.Error as Table_Errors import Invalid_Row, Mismatched_Quote, Parser_Error from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding from Standard.Table.Io.File_Format import Infer polyglot java import org.enso.table.read.DelimitedReader polyglot java import org.enso.table.read.ParsingFailedException polyglot java import org.enso.table.read.InvalidRow +polyglot java import org.enso.table.read.MismatchedQuote polyglot java import java.lang.IllegalArgumentException polyglot java import java.io.IOException polyglot java import com.univocity.parsers.common.TextParsingException @@ -58,7 +59,8 @@ read_stream format java_stream on_problems max_columns=4096 related_file=Nothing Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) translate_problem java_problem = if Java.is_instance java_problem InvalidRow then Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row) else - java_problem + if Java.is_instance java_problem MismatchedQuote then Mismatched_Quote else + java_problem translate_parsing_failure caught_panic = Error.throw (translate_problem caught_panic.payload.cause.problem) translate_parsing_exception caught_panic = diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index 5d158a6f11a7..7d0322e7dfdc 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -116,12 +116,56 @@ private CsvParser setupCsvParser(InputStream inputStream) { settings.setMaxCharsPerColumn(-1); settings.setMaxColumns(maxColumns); settings.setSkipEmptyLines(false); - settings.setKeepQuotes(false); + settings.setKeepQuotes(true); CsvParser parser = new CsvParser(settings); parser.beginParsing(inputStream); return parser; } + private String parseCell(String cell) { + if (cell == null) return null; + + // TODO [RW] here we can plug-in some more complex parsing logic, to be done + // as part of https://www.pivotaltracker.com/story/show/181824146 + if (cell.isEmpty()) return cell; + if (cell.charAt(0) == quoteCharacter) { + return stripQuotes(cell); + } + + return cell; + } + + private String parseHeader(String cell) { + if (cell == null) return "Column"; + + if (cell.isEmpty()) return cell; + if (cell.charAt(0) == quoteCharacter) { + return stripQuotes(cell); + } + + return cell; + } + + private String stripQuotes(String cell) { + assert cell.charAt(0) == quoteCharacter; + + if (cell.length() < 2 || cell.charAt(cell.length() - 1) != quoteCharacter) { + reportMismatchedQuote(); + return cell.substring(1); + } else { + // Strip quotes. + return cell.substring(1, cell.length() - 1); + } + } + + private void reportMismatchedQuote() { + reportProblem(new MismatchedQuote()); + } + + private void reportInvalidRow(long source_row, Long table_index, String[] row) { + reportProblem(new InvalidRow(source_row, table_index, row)); + } + public List getReportedProblems() { return warnings; } @@ -135,8 +179,10 @@ private void reportProblem(ParsingProblem problem) { } private long target_table_index = 0; + private long current_line = 0; private String[] nextRow() { + current_line = parser.getContext().currentLine() + 1; return parser.parseNext(); } @@ -158,7 +204,8 @@ public Table read() { case INFER: throw new IllegalStateException("Inferring headers is not yet implemented"); case USE_FIRST_ROW_AS_HEADERS: - List preprocessedHeaders = Arrays.stream(currentRow).map(s -> s == null ? "Column" : s).collect(Collectors.toList()); + List preprocessedHeaders = + Arrays.stream(currentRow).map(this::parseHeader).collect(Collectors.toList()); headerNames = NameDeduplicator.deduplicate(preprocessedHeaders, "_"); // We have 'used up' the first row, so we load a next one. currentRow = nextRow(); @@ -177,16 +224,19 @@ public Table read() { while (currentRow != null && (rowLimit < 0 || target_table_index < rowLimit)) { if (currentRow.length != builders.length) { - reportProblem( - new InvalidRow(parser.getContext().currentLine(), keepInvalidRows ? target_table_index : null, currentRow)); + reportInvalidRow( + current_line, + keepInvalidRows ? target_table_index : null, + currentRow); if (keepInvalidRows) { for (int i = 0; i < builders.length && i < currentRow.length; i++) { - String item = currentRow[i]; + String item = parseCell(currentRow[i]); builders[i] = builders[i].parseAndAppend(item); } - // If the current row had less columns than expected, nulls are inserted for the missing values. + // If the current row had less columns than expected, nulls are inserted for the missing + // values. // If it had more columns, the excess columns are discarded. for (int i = currentRow.length; i < builders.length; i++) { builders[i] = builders[i].parseAndAppend(null); @@ -196,9 +246,8 @@ public Table read() { } } else { for (int i = 0; i < builders.length; i++) { - // TODO [RW] here we can plug-in some parsing logic, to be done as part of - // https://www.pivotaltracker.com/story/show/181824146 - String item = currentRow[i]; + + String item = parseCell(currentRow[i]); builders[i] = builders[i].parseAndAppend(item); } diff --git a/std-bits/table/src/main/java/org/enso/table/read/MismatchedQuote.java b/std-bits/table/src/main/java/org/enso/table/read/MismatchedQuote.java new file mode 100644 index 000000000000..969790f4e755 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/read/MismatchedQuote.java @@ -0,0 +1,3 @@ +package org.enso.table.read; + +public class MismatchedQuote implements ParsingProblem {} diff --git a/test/Table_Tests/data/mismatched_quote.csv b/test/Table_Tests/data/mismatched_quote.csv index 931de64013f6..6ba867631c5a 100644 --- a/test/Table_Tests/data/mismatched_quote.csv +++ b/test/Table_Tests/data/mismatched_quote.csv @@ -1,4 +1,4 @@ a,b,c 1,2,3 - abc,"d e f,ghi + abc,"def","g h i 7,8,9 diff --git a/test/Table_Tests/data/mismatched_quote2.csv b/test/Table_Tests/data/mismatched_quote2.csv new file mode 100644 index 000000000000..3cfe39451c72 --- /dev/null +++ b/test/Table_Tests/data/mismatched_quote2.csv @@ -0,0 +1,4 @@ +a,b,c +1,2,3 + abc,"def,g h i +7,8,9 diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso index c685a2b0a147..7d7ebc8e03cf 100644 --- a/test/Table_Tests/src/Delimited_Read_Spec.enso +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -2,7 +2,7 @@ from Standard.Base import all import Standard.Table import Standard.Table.Data.Column -from Standard.Table.Error import Invalid_Row +from Standard.Table.Error import all import Standard.Table.Io.File_Format import Standard.Base.Error.Problem_Behavior import Standard.Test @@ -75,13 +75,35 @@ spec = t1.at 'b' . to_vector . should_equal ['start\n\ncontinue', '5'] t1.at 'c' . to_vector . should_equal ['3', '6'] + Test.specify "should behave correctly in presence of a mismatched quote" <| + action_1 on_problems = + (File_Format.Delimited "," headers=True).read (Enso_Project.data / "mismatched_quote.csv") on_problems + + tester_1 table = + table.columns.map .name . should_equal ['a', 'b', 'c'] + table.at 'a' . to_vector . should_equal ['1', 'abc'] + table.at 'b' . to_vector . should_equal ['2', 'def'] + table.at 'c' . to_vector . should_equal ['3', 'g h i\n7,8,9\n'] + problems_1 = [Mismatched_Quote] + Problems.test_problem_handling action_1 problems_1 tester_1 + + action_2 on_problems = + (File_Format.Delimited "," headers=True).read (Enso_Project.data / "mismatched_quote2.csv") on_problems + + tester_2 table = + table.columns.map .name . should_equal ['a', 'b', 'c'] + table.at 'a' . to_vector . should_equal ['1', 'abc'] + table.at 'b' . to_vector . should_equal ['2', 'def,g h i\n7,8,9\n'] + table.at 'c' . to_vector . should_equal ['3', Nothing] + problems_2 = [Invalid_Row 3 1 ['abc', '"def,g h i\n7,8,9\n'], Mismatched_Quote] + Problems.test_problem_handling action_2 problems_2 tester_2 + Test.specify "should handle too long and too short rows" <| action keep_invalid_rows on_problems = (File_Format.Delimited "," headers=True keep_invalid_rows=keep_invalid_rows).read (Enso_Project.data / "varying_rows.csv") on_problems tester_kept table = table.columns.map .name . should_equal ['a', 'b', 'c'] - # FIXME [RW] A completely empty line is ignored instead of being treated as empty row. OK? table.at 'a' . to_vector . should_equal ['1', '1', '1', Nothing, '1', '1'] table.at 'b' . to_vector . should_equal ['2', '2', '2', Nothing, Nothing, '2'] table.at 'c' . to_vector . should_equal ['3', '3', Nothing, Nothing, Nothing, '3'] From 9304a2d8a2f1176882661b092462c908c7202b91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 28 Apr 2022 18:39:18 +0200 Subject: [PATCH 09/13] Aggregate additional invalid rows --- .../Table/0.0.0-dev/src/Io/Delimited_Reader.enso | 6 ++++-- .../enso/table/read/AdditionalInvalidRows.java | 9 +++++++++ .../org/enso/table/read/DelimitedReader.java | 16 ++++++++++++++-- test/Table_Tests/src/Delimited_Read_Spec.enso | 12 ++++++++++++ 4 files changed, 39 insertions(+), 4 deletions(-) create mode 100644 std-bits/table/src/main/java/org/enso/table/read/AdditionalInvalidRows.java diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso index baf1a01d9492..d37b814e4119 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso @@ -3,7 +3,7 @@ import Standard.Table import Standard.Base.Error.Extensions as Errors from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior -from Standard.Table.Error as Table_Errors import Invalid_Row, Mismatched_Quote, Parser_Error +from Standard.Table.Error as Table_Errors import Invalid_Row, Mismatched_Quote, Parser_Error, Additional_Invalid_Rows from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding from Standard.Table.Io.File_Format import Infer @@ -11,6 +11,7 @@ polyglot java import org.enso.table.read.DelimitedReader polyglot java import org.enso.table.read.ParsingFailedException polyglot java import org.enso.table.read.InvalidRow polyglot java import org.enso.table.read.MismatchedQuote +polyglot java import org.enso.table.read.AdditionalInvalidRows polyglot java import java.lang.IllegalArgumentException polyglot java import java.io.IOException polyglot java import com.univocity.parsers.common.TextParsingException @@ -60,7 +61,8 @@ read_stream format java_stream on_problems max_columns=4096 related_file=Nothing translate_problem java_problem = if Java.is_instance java_problem InvalidRow then Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row) else if Java.is_instance java_problem MismatchedQuote then Mismatched_Quote else - java_problem + if Java.is_instance java_problem AdditionalInvalidRows then Additional_Invalid_Rows java_problem.count else + java_problem translate_parsing_failure caught_panic = Error.throw (translate_problem caught_panic.payload.cause.problem) translate_parsing_exception caught_panic = diff --git a/std-bits/table/src/main/java/org/enso/table/read/AdditionalInvalidRows.java b/std-bits/table/src/main/java/org/enso/table/read/AdditionalInvalidRows.java new file mode 100644 index 000000000000..72541efbfbd6 --- /dev/null +++ b/std-bits/table/src/main/java/org/enso/table/read/AdditionalInvalidRows.java @@ -0,0 +1,9 @@ +package org.enso.table.read; + +public class AdditionalInvalidRows implements ParsingProblem { +public final long count; + + public AdditionalInvalidRows(long count) { + this.count = count; + } +} diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index 7d0322e7dfdc..f251bce4e28b 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -162,12 +162,24 @@ private void reportMismatchedQuote() { reportProblem(new MismatchedQuote()); } + private long invalidRowsCount = 0; + private final static long invalidRowsLimit = 10; + private void reportInvalidRow(long source_row, Long table_index, String[] row) { - reportProblem(new InvalidRow(source_row, table_index, row)); + if (invalidRowsCount < invalidRowsLimit) { + reportProblem(new InvalidRow(source_row, table_index, row)); + } + + invalidRowsCount++; } public List getReportedProblems() { - return warnings; + List result = new ArrayList<>(warnings); + if (invalidRowsCount > invalidRowsLimit) { + long additionalInvalidRows = invalidRowsCount - invalidRowsLimit; + result.add(new AdditionalInvalidRows(additionalInvalidRows)); + } + return result; } private void reportProblem(ParsingProblem problem) { diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso index 7d7ebc8e03cf..5a89ac155a91 100644 --- a/test/Table_Tests/src/Delimited_Read_Spec.enso +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -118,6 +118,18 @@ spec = problems_dropped = [Invalid_Row 2 Nothing ['1', '2', '3', '4'], Invalid_Row 4 Nothing ['1', '2'], Invalid_Row 5 Nothing [Nothing], Invalid_Row 6 Nothing ['1'], Invalid_Row 7 Nothing ['1', '2', '3', '4', '5', '6', '7', '8']] Problems.test_problem_handling (action keep_invalid_rows=False) problems_dropped tester_dropped + Test.specify "should aggregate invalid rows over some limit" <| + action on_problems = + (File_Format.Delimited "," headers=True keep_invalid_rows=False).read (Enso_Project.data / "many_invalid_rows.csv") on_problems + + tester table = + table.columns.map .name . should_equal ['a', 'b', 'c'] + table.at 'a' . to_vector . should_equal ['0', '5'] + table.at 'b' . to_vector . should_equal ['x', 'u'] + table.at 'c' . to_vector . should_equal ['y', 'v'] + problems = [Invalid_Row 3 Nothing ['1'], Invalid_Row 4 Nothing ['2'], Invalid_Row 5 Nothing ['3'], Invalid_Row 6 Nothing ['4'], Invalid_Row 8 Nothing ['6'], Invalid_Row 9 Nothing ['7'], Invalid_Row 10 Nothing ['8'], Invalid_Row 11 Nothing ['9'], Invalid_Row 12 Nothing ['10'], Invalid_Row 13 Nothing ['11'], Additional_Invalid_Rows 3] + Problems.test_problem_handling action problems tester + Test.specify "should allow to skip rows" <| t1 = (File_Format.Delimited "," headers=False skip_rows=3).read (Enso_Project.data / "simple_empty.csv") Problem_Behavior.Report_Error t1.at "Column_1" . to_vector . should_equal ['7', '10'] From e44e3fa3064f8020f54f11741c2126d6b66ac107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 28 Apr 2022 23:25:59 +0200 Subject: [PATCH 10/13] docs --- .../0.0.0-dev/src/Io/Delimited_Reader.enso | 30 ++++++-- .../Table/0.0.0-dev/src/Io/File_Format.enso | 15 +++- .../table/read/AdditionalInvalidRows.java | 3 +- .../org/enso/table/read/DelimitedReader.java | 73 ++++++++++++++++--- .../java/org/enso/table/read/InvalidRow.java | 1 + .../org/enso/table/read/MismatchedQuote.java | 1 + .../table/read/ParsingFailedException.java | 1 - .../org/enso/table/read/ParsingProblem.java | 4 + 8 files changed, 103 insertions(+), 25 deletions(-) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso index d37b814e4119..d2440e5e1208 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso @@ -17,7 +17,15 @@ polyglot java import java.io.IOException polyglot java import com.univocity.parsers.common.TextParsingException polyglot java import java.io.InputStream -## Reads a file according to the provided format. +## Reads a delimited file according to the provided format. + + Arguments: + - format: The specification of the delimited file format. + - file: The file to read. + - on_problems: Specifies the behavior when a problem occurs during the + operation. By default, a warning is issued, but the operation proceeds. + If set to `Report_Error`, the operation fails with a dataflow error. + If set to `Ignore`, the operation proceeds without errors or warnings. read_file : Delimited -> File -> Problem_Behavior -> Any read_file format file on_problems = if format.encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else @@ -32,15 +40,21 @@ read_file format file on_problems = ## PRIVATE Reads an input stream according to the provided format. - The `encoding` parameter is ignored, the provided stream should handle any - necessary decoding. + The `encoding` parameter is ignored, instead the provided stream should + handle any necessary decoding. Arguments: - - format: - - java_stream: - - on_problems: - - max_columns: - - related_file: + - format: The specification of the delimited file format. + - java_stream: A Java `InputStream` used as the data source. + - on_problems: Specifies the behavior when a problem occurs during the + operation. By default, a warning is issued, but the operation proceeds. + If set to `Report_Error`, the operation fails with a dataflow error. + If set to `Ignore`, the operation proceeds without errors or warnings. + - max_columns: Specifies the limit of columns to read. The limit is set to + avoid `OutOfMemory` errors on malformed files. It must be a positive + integer. + - related_file: The file related to the provided `java_stream`, if available, + or `Nothing`. It is used for more detailed error reporting. read_stream : Delimited -> InputStream -> Problem_Behavior -> File | Nothing -> Any read_stream format java_stream on_problems max_columns=4096 related_file=Nothing = java_headers = case format.headers of diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index 1ff81ab1ce95..48b353d8aab4 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -70,19 +70,28 @@ type Delimited value, two consecutive quote characters are interpreted as an instance of the quote character. Empty input strings must be quoted (e.g. "") as otherwise an empty value is treated as `Nothing`. - - quote_escape: TODO + - quote_escape: The character to escape the quote character in a quoted + value. For example, if both `quote` and `quote_escape` are set to `"`, + then escaping quotes is done by double quotes: `"ab""cd"` will yield + the text `ab"cd"`. Another popular choice for `quote_escape` is the `\` + character. Then `"ab\"cd"` will yield the same text. - headers: If set to `True`, the first row is used as column names. If set to `False`, the column names are generated by adding increasing numeric suffixes to the base name `Column` (i.e. `Column_1`, `Column_2` etc.). If set to `Infer`, the process tries to infer if headers are present on the first row (`Infer` is not implemented yet). + If the column names are not unique, numeric suffixes will be appended + to disambiguate them. - parse_values: The output columns are parsed using the default `Parser` if 'True'. If more control over parsing is needed, the `Table.parse_values` method allows full specifications of the parser options. - skip_rows: The number of rows to skip from the top of the file. - - row_limit: The maximum number of rows to read from the file. - - keep_invalid_rows: TODO + - row_limit: The maximum number of rows to read from the file. This count + does not include the header row (if applicable). + - keep_invalid_rows: Specifies whether rows that contain less or more + columns than expected should be kept (setting the missing columns to + `Nothing` or dropping the excess columns) or dropped. TODO [RW] The default for `headers` is temporarily changed to `False`, because `Infer` is not supported. It should be changed to be the default diff --git a/std-bits/table/src/main/java/org/enso/table/read/AdditionalInvalidRows.java b/std-bits/table/src/main/java/org/enso/table/read/AdditionalInvalidRows.java index 72541efbfbd6..d5d26a41ca86 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/AdditionalInvalidRows.java +++ b/std-bits/table/src/main/java/org/enso/table/read/AdditionalInvalidRows.java @@ -1,7 +1,8 @@ package org.enso.table.read; +/** A problem which indicates how many additional invalid rows were encountered. */ public class AdditionalInvalidRows implements ParsingProblem { -public final long count; + public final long count; public AdditionalInvalidRows(long count) { this.count = count; diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index f251bce4e28b..13260c00ae14 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -16,13 +16,25 @@ import org.enso.table.data.table.Table; import org.enso.table.util.NameDeduplicator; +/** A helper for reading delimited (CSV-like) files. */ public class DelimitedReader { + + /** Specifies how to set the headers for the returned table. */ public enum HeaderBehavior { + /** Tries to infer if the headers are present in the file. */ INFER, + + /** Uses the first row in the file as headers. Duplicate names will be appended suffixes. */ USE_FIRST_ROW_AS_HEADERS, + + /** + * Treats the first row as data and generates header names starting with {@code COLUMN_NAME}. + */ GENERATE_HEADERS } + private static final String COLUMN_NAME = "Column"; + private final char delimiter; private final char quoteCharacter; private final char quoteEscapeCharacter; @@ -37,10 +49,28 @@ public enum HeaderBehavior { private static final char noQuoteCharacter = '\0'; - boolean hasQuote() { - return quoteCharacter != noQuoteCharacter; - } - + /** + * Creates a new reader. + * + * @param inputStream the stream to read from + * @param delimiter the delimiter, should be a single character, but is a String for proper + * interoperability with Enso; if a string that does not fit in a single character is + * provided, an exception is raised + * @param quote the quote character to use, should be a single character or {@code null}, but is a + * String for proper interoperability with Enso; if a string that does not fit in a single + * character is provided, an exception is raised + * @param quoteEscape the quote escape character to use, should be a single character or {@code + * null}, but is a * String for proper interoperability with Enso; if a string that does not + * fit in a single * character is provided, an exception is raised + * @param headerBehavior specifies how to set the header for the resulting table + * @param skipRows specifies how many rows from the input to skip + * @param rowLimit specifies how many rows to read (does not include the header row) + * @param maxColumns specifies how many columns can be expected at most + * @param keepInvalidRows specifies whether to keep rows that had an unexpected number of columns + * @param warningsAsErrors specifies if the first warning should be immediately raised as an error + * (used as a fast-path for the error-reporting mode to avoid computing a value that is going + * to be discarded anyway) + */ public DelimitedReader( InputStream inputStream, String delimiter, @@ -105,6 +135,7 @@ public DelimitedReader( parser = setupCsvParser(inputStream); } + /** Creates a {@code CsvParser} according to the settings specified at construction. */ private CsvParser setupCsvParser(InputStream inputStream) { CsvParserSettings settings = new CsvParserSettings(); settings.setHeaderExtractionEnabled(false); @@ -122,6 +153,12 @@ private CsvParser setupCsvParser(InputStream inputStream) { return parser; } + /** + * Parses a cell. + * + *

Currently just handles quotes. In the future it will be extended to delegate to a parsing + * strategy for values. + */ private String parseCell(String cell) { if (cell == null) return null; @@ -135,8 +172,9 @@ private String parseCell(String cell) { return cell; } + /** Parses a header cell, removing surrounding quotes. */ private String parseHeader(String cell) { - if (cell == null) return "Column"; + if (cell == null) return COLUMN_NAME; if (cell.isEmpty()) return cell; if (cell.charAt(0) == quoteCharacter) { @@ -146,6 +184,12 @@ private String parseHeader(String cell) { return cell; } + /** + * If the first character of a string is a quote, will remove the surrounding quotes. + * + *

If the first character of a string is a quote but the last one is not, mismatched quote + * problem is reported. + */ private String stripQuotes(String cell) { assert cell.charAt(0) == quoteCharacter; @@ -163,7 +207,7 @@ private void reportMismatchedQuote() { } private long invalidRowsCount = 0; - private final static long invalidRowsLimit = 10; + private static final long invalidRowsLimit = 10; private void reportInvalidRow(long source_row, Long table_index, String[] row) { if (invalidRowsCount < invalidRowsLimit) { @@ -173,6 +217,7 @@ private void reportInvalidRow(long source_row, Long table_index, String[] row) { invalidRowsCount++; } + /** Returns a list of currently reported problems encountered when parsing the input. */ public List getReportedProblems() { List result = new ArrayList<>(warnings); if (invalidRowsCount > invalidRowsLimit) { @@ -191,13 +236,21 @@ private void reportProblem(ParsingProblem problem) { } private long target_table_index = 0; + + /** The line number of the start of the current row in the input file. */ private long current_line = 0; + /** + * Reads the next row and updates the current line accordingly. + * + *

Will return {@code null} if no more rows are available. + */ private String[] nextRow() { current_line = parser.getContext().currentLine() + 1; return parser.parseNext(); } + /** Reads the input stream and returns a Table. */ public Table read() { List headerNames; String[] currentRow = nextRow(); @@ -225,7 +278,7 @@ public Table read() { case GENERATE_HEADERS: headerNames = new ArrayList<>(currentRow.length); for (int i = 0; i < currentRow.length; ++i) { - headerNames.add("Column_" + (i + 1)); + headerNames.add(COLUMN_NAME + "_" + (i + 1)); } break; default: @@ -236,10 +289,7 @@ public Table read() { while (currentRow != null && (rowLimit < 0 || target_table_index < rowLimit)) { if (currentRow.length != builders.length) { - reportInvalidRow( - current_line, - keepInvalidRows ? target_table_index : null, - currentRow); + reportInvalidRow(current_line, keepInvalidRows ? target_table_index : null, currentRow); if (keepInvalidRows) { for (int i = 0; i < builders.length && i < currentRow.length; i++) { @@ -269,7 +319,6 @@ public Table read() { currentRow = nextRow(); } - // FIXME [RW] check if this should be a try-finally parser.stopParsing(); Column[] columns = new Column[builders.length]; diff --git a/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java b/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java index cf678f475317..333b25a99561 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java +++ b/std-bits/table/src/main/java/org/enso/table/read/InvalidRow.java @@ -1,5 +1,6 @@ package org.enso.table.read; +/** A problem indicating that a row contained more or less columns than expected. */ public class InvalidRow implements ParsingProblem { public final long source_row; public final Long table_index; diff --git a/std-bits/table/src/main/java/org/enso/table/read/MismatchedQuote.java b/std-bits/table/src/main/java/org/enso/table/read/MismatchedQuote.java index 969790f4e755..0c303283c769 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/MismatchedQuote.java +++ b/std-bits/table/src/main/java/org/enso/table/read/MismatchedQuote.java @@ -1,3 +1,4 @@ package org.enso.table.read; +/** A problem indicating that a quote has been opened and never closed. */ public class MismatchedQuote implements ParsingProblem {} diff --git a/std-bits/table/src/main/java/org/enso/table/read/ParsingFailedException.java b/std-bits/table/src/main/java/org/enso/table/read/ParsingFailedException.java index cae741f02537..102d1fa7dac0 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/ParsingFailedException.java +++ b/std-bits/table/src/main/java/org/enso/table/read/ParsingFailedException.java @@ -7,7 +7,6 @@ public class ParsingFailedException extends RuntimeException { public final ParsingProblem problem; - public ParsingFailedException(ParsingProblem problem) { this.problem = problem; } diff --git a/std-bits/table/src/main/java/org/enso/table/read/ParsingProblem.java b/std-bits/table/src/main/java/org/enso/table/read/ParsingProblem.java index 1587051dafc9..fd3ada1728e4 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/ParsingProblem.java +++ b/std-bits/table/src/main/java/org/enso/table/read/ParsingProblem.java @@ -1,3 +1,7 @@ package org.enso.table.read; +/** + * A parent class for parsing problems which may be reported as warnings or errors, depending on the + * setup. + */ public interface ParsingProblem {} From c4260d056f365e7950a75ce38e002e123a43191b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 28 Apr 2022 23:28:35 +0200 Subject: [PATCH 11/13] changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c5965c127fbb..189102ec42ef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -114,6 +114,7 @@ - [Improved the `Range` type. Added a `down_to` counterpart to `up_to` and `with_step` allowing to change the range step.][3408] - [Aligned `Text.split` API with other methods and added `Text.lines`.][3415] +- [Implemented a basic reader for the `Delimited` file format.][3424] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -175,6 +176,7 @@ [3390]: https://github.com/enso-org/enso/pull/3390 [3408]: https://github.com/enso-org/enso/pull/3408 [3415]: https://github.com/enso-org/enso/pull/3415 +[3424]: https://github.com/enso-org/enso/pull/3424 #### Enso Compiler From 796dcb7f2a7ba01abd3f6e48e1c4d967deba8d15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Fri, 29 Apr 2022 17:18:17 +0200 Subject: [PATCH 12/13] refactor --- .../src/{Io => Internal}/Delimited_Reader.enso | 0 .../Standard/Table/0.0.0-dev/src/Io/File_Format.enso | 2 +- .../java/org/enso/table/read/DelimitedReader.java | 11 ++--------- 3 files changed, 3 insertions(+), 10 deletions(-) rename distribution/lib/Standard/Table/0.0.0-dev/src/{Io => Internal}/Delimited_Reader.enso (100%) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso similarity index 100% rename from distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso rename to distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index 48b353d8aab4..cad8845b3aac 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -4,7 +4,7 @@ import Standard.Table import Standard.Base.Error.Extensions as Errors from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding -import Standard.Table.Io.Delimited_Reader +import Standard.Table.Internal.Delimited_Reader ## This type needs to be here to allow for the usage of Standard.Table functions. Ideally, it would be an interface within Standard.Base and diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index 13260c00ae14..b5ce43e62dfd 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -153,17 +153,10 @@ private CsvParser setupCsvParser(InputStream inputStream) { return parser; } - /** - * Parses a cell. - * - *

Currently just handles quotes. In the future it will be extended to delegate to a parsing - * strategy for values. - */ + /** Parses a cell, removing surrounding quotes (if applicable). */ private String parseCell(String cell) { if (cell == null) return null; - // TODO [RW] here we can plug-in some more complex parsing logic, to be done - // as part of https://www.pivotaltracker.com/story/show/181824146 if (cell.isEmpty()) return cell; if (cell.charAt(0) == quoteCharacter) { return stripQuotes(cell); @@ -172,7 +165,7 @@ private String parseCell(String cell) { return cell; } - /** Parses a header cell, removing surrounding quotes. */ + /** Parses a header cell, removing surrounding quotes (if applicable). */ private String parseHeader(String cell) { if (cell == null) return COLUMN_NAME; From 5a862a6342d22d7493e6007542a21ba08a323ccc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Fri, 29 Apr 2022 17:30:22 +0200 Subject: [PATCH 13/13] Revert builtins changed They block me from merging, are a micro-fix not related to the PR. Probably better addressed in the Builtins refactor anyway. --- engine/runtime/src/main/resources/Builtins.enso | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/engine/runtime/src/main/resources/Builtins.enso b/engine/runtime/src/main/resources/Builtins.enso index d94f9decf788..4fd145412268 100644 --- a/engine/runtime/src/main/resources/Builtins.enso +++ b/engine/runtime/src/main/resources/Builtins.enso @@ -309,11 +309,10 @@ type Inexhaustive_Pattern_Match_Error scrutinee does not match the expected number of arguments. Arguments: - - expected_min: the minimum expected number of arguments. - - expected_max: the maximum expected number of arguments. + - expected: the expected number of arguments. - actual: the actual number of arguments passed. @Builtin_Type -type Arity_Error expected_min expected_max actual +type Arity_Error expected actual ## The error thrown when the program attempts to read from a state slot that has not yet been initialized.