enso-org · mergify · Apr 29, 2022 · Apr 25, 2022 · Apr 26, 2022 · Apr 26, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -114,6 +114,7 @@
 - [Improved the `Range` type. Added a `down_to` counterpart to `up_to` and
   `with_step` allowing to change the range step.][3408]
 - [Aligned `Text.split` API with other methods and added `Text.lines`.][3415]
+- [Implemented a basic reader for the `Delimited` file format.][3424]
 
 [debug-shortcuts]:
   https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -175,6 +176,7 @@
 [3390]: https://github.com/enso-org/enso/pull/3390
 [3408]: https://github.com/enso-org/enso/pull/3408
 [3415]: https://github.com/enso-org/enso/pull/3415
+[3424]: https://github.com/enso-org/enso/pull/3424
 
 #### Enso Compiler
 

@@ -882,10 +882,16 @@ type Input_Stream
 
    Utility method for running an action with Java exceptions mapping.
 handle_java_exceptions file ~action =
-    Panic.catch IOException handler=(caught_panic-> (Error.throw (Io_Error file "An IO error has occurred: " + caught_panic.payload.cause.getMessage))) <|
-        Panic.catch AccessDeniedException handler=(_-> (Error.throw (Io_Error file "You do not have permission to access the file"))) <|
-            Panic.catch NoSuchFileException handler=(_-> (Error.throw (File_Not_Found file))) <|
-                action
+    Panic.catch IOException action caught_panic->
+        here.wrap_io_exception file caught_panic.payload.cause
+
+## PRIVATE
+
+   Converts a Java `IOException` into its Enso counterpart.
+wrap_io_exception file io_exception =
+    if Java.is_instance io_exception NoSuchFileException then Error.throw (File_Not_Found file) else
+        if Java.is_instance io_exception AccessDeniedException then Error.throw (Io_Error file "You do not have permission to access the file") else
+            Error.throw (Io_Error file "An IO error has occurred: "+io_exception.getMessage)
 
 ## PRIVATE
 

@@ -94,3 +94,20 @@ type Additional_Warnings (count:Integer)
 Additional_Warnings.to_display_text : Text
 Additional_Warnings.to_display_text =
     "There were "+this.count.to_text+" additional issues."
+
+## Indicates that when loading a delimited file, a row was encountered which had
+   too many or too few columns.
+
+   Only the first 10 rows are reported; any additional ones are aggregated into
+   a single instance of `Additional_Invalid_Rows`.
+type Invalid_Row (source_file_line_number : Integer) (index : Integer | Nothing) (row : [Text])
+
+## Indicates how many additional `Invalid_Row` warnings have been suppressed.
+type Additional_Invalid_Rows (count : Integer)
+
+## Indicates that a quote inside of a delimited file cell has been opened but
+   never closed.
+type Mismatched_Quote
+
+## Indicates an unexpected parser error.
+type Parser_Error cause
@@ -64,19 +64,18 @@ from_csv : File.File | Text -> Boolean -> Text -> Table ! Parse_Error
 from_csv csv has_header=True prefix='C' =
     parser_inst = Parser.create has_header prefix
 
-    handle_error error = case error of
-        Polyglot_Error err -> Error.throw (Parse_Error err.getMessage)
-        _ -> Panic.throw error
+    handle_error caught_panic =
+        Error.throw (Parse_Error caught_panic.payload.cause.getMessage)
 
     case csv of
         Text ->
             input_stream = ByteArrayInputStream.new csv.utf_8.to_array
-            Panic.recover Any Table.Table (parser_inst.parse input_stream) . catch handle_error
+            Panic.catch Polyglot_Error (Table.Table (parser_inst.parse input_stream)) handle_error
         File.File _ ->
-            maybe_err = Panic.recover Any <| csv.with_input_stream [File.Option.Read] stream->
-                stream.with_java_stream java_stream->
-                    Table.Table (parser_inst.parse java_stream)
-            maybe_err.catch handle_error
+            Panic.catch Polyglot_Error handler=handle_error <|
+                csv.with_input_stream [File.Option.Read] stream->
+                    stream.with_java_stream java_stream->
+                        Table.Table (parser_inst.parse java_stream)
         _ ->
             found_type_name = Meta.get_qualified_type_name csv
             file_name = Meta.get_qualified_type_name File.File

diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Delimited_Reader.enso
@@ -0,0 +1,94 @@
+from Standard.Base import all
+import Standard.Table
+
+import Standard.Base.Error.Extensions as Errors
+from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior
+from Standard.Table.Error as Table_Errors import Invalid_Row, Mismatched_Quote, Parser_Error, Additional_Invalid_Rows
+from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
+from Standard.Table.Io.File_Format import Infer
+
+polyglot java import org.enso.table.read.DelimitedReader
+polyglot java import org.enso.table.read.ParsingFailedException
+polyglot java import org.enso.table.read.InvalidRow
+polyglot java import org.enso.table.read.MismatchedQuote
+polyglot java import org.enso.table.read.AdditionalInvalidRows
+polyglot java import java.lang.IllegalArgumentException
+polyglot java import java.io.IOException
+polyglot java import com.univocity.parsers.common.TextParsingException
+polyglot java import java.io.InputStream
+
+## Reads a delimited file according to the provided format.
+
+   Arguments:
+   - format: The specification of the delimited file format.
+   - file: The file to read.
+   - on_problems: Specifies the behavior when a problem occurs during the
+     operation. By default, a warning is issued, but the operation proceeds.
+     If set to `Report_Error`, the operation fails with a dataflow error.
+     If set to `Ignore`, the operation proceeds without errors or warnings.
+read_file : Delimited -> File -> Problem_Behavior -> Any
+read_file format file on_problems =
+    if format.encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else
+        ## We use the default `max_columns` setting. If we want to be able to
+           read files with unlimited column limits (risking OutOfMemory
+           exceptions), we can catch the exception indicating the limit has been
+           reached and restart parsing with an increased limit.
+        file.with_input_stream [File.Option.Read] stream->
+            stream.with_java_stream java_stream->
+                here.read_stream format java_stream on_problems related_file=file
+
+## PRIVATE
+   Reads an input stream according to the provided format.
+
+   The `encoding` parameter is ignored; instead, the provided stream should
+   handle any necessary decoding.
+
+   Arguments:
+   - format: The specification of the delimited file format.
+   - java_stream: A Java `InputStream` used as the data source.
+   - on_problems: Specifies the behavior when a problem occurs during the
+     operation. By default, a warning is issued, but the operation proceeds.
+     If set to `Report_Error`, the operation fails with a dataflow error.
+     If set to `Ignore`, the operation proceeds without errors or warnings.
+   - max_columns: Specifies the limit of columns to read. The limit is set to
+     avoid `OutOfMemory` errors on malformed files. It must be a positive
+     integer.
+   - related_file: The file related to the provided `java_stream`, if available,
+     or `Nothing`. It is used for more detailed error reporting.
+read_stream : Delimited -> InputStream -> Problem_Behavior -> Integer -> File | Nothing -> Any
+read_stream format java_stream on_problems max_columns=4096 related_file=Nothing =
+    java_headers = case format.headers of
+        True -> DelimitedReader.HeaderBehavior.USE_FIRST_ROW_AS_HEADERS
+        Infer -> Errors.unimplemented "Inferring headers is not implemented yet."
+        False -> DelimitedReader.HeaderBehavior.GENERATE_HEADERS
+    skip_rows = case format.skip_rows of
+        Nothing -> 0
+        Integer -> format.skip_rows
+        _ -> Error.throw (Illegal_Argument_Error "`skip_rows` should be Integer or Nothing.")
+    row_limit = case format.row_limit of
+        Nothing -> -1
+        Integer -> format.row_limit
+        _ -> Error.throw (Illegal_Argument_Error "`row_limit` should be Integer or Nothing.")
+    if format.parse_values then Errors.unimplemented "Parsing values is not implemented yet." else
+        translate_illegal_argument caught_panic =
+            Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage)
+        translate_problem java_problem =
+            if Java.is_instance java_problem InvalidRow then Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row) else
+                if Java.is_instance java_problem MismatchedQuote then Mismatched_Quote else
+                    if Java.is_instance java_problem AdditionalInvalidRows then Additional_Invalid_Rows java_problem.count else
+                        java_problem
+        translate_parsing_failure caught_panic =
+            Error.throw (translate_problem caught_panic.payload.cause.problem)
+        translate_parsing_exception caught_panic =
+            cause = caught_panic.payload.cause.getCause
+            if Java.is_instance cause IOException then File.wrap_io_exception related_file cause else
+                Error.throw (Parser_Error caught_panic.payload)
+
+        Panic.catch IllegalArgumentException handler=translate_illegal_argument <|
+            Panic.catch ParsingFailedException handler=translate_parsing_failure <|
+                Panic.catch TextParsingException handler=translate_parsing_exception <|
+                    warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error
+                    reader = DelimitedReader.new java_stream format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns format.keep_invalid_rows warnings_as_errors
+                    result = Table.Table reader.read
+                    problems = Vector.Vector reader.getReportedProblems . map translate_problem
+                    on_problems.attach_problems_after result problems
@@ -1,6 +1,10 @@
 from Standard.Base import all
+import Standard.Table
+
+import Standard.Base.Error.Extensions as Errors
 from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior
 from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
+import Standard.Table.Io.Delimited_Reader
 
 ## This type needs to be here to allow for the usage of Standard.Table
    functions. Ideally, it would be an interface within Standard.Base and
@@ -19,6 +23,8 @@ type Auto
         output = Ref.new File_Format.Bytes
         if ".txt".equals_ignore_case extension then Ref.put output File_Format.Text
         if ".log".equals_ignore_case extension then Ref.put output File_Format.Text
+        if ".csv".equals_ignore_case extension then Ref.put output (File_Format.Delimited ',')
+        if ".tsv".equals_ignore_case extension then Ref.put output (File_Format.Delimited '\t')
 
         Ref.get output
 
@@ -45,3 +51,64 @@ type Text
     read : File -> Problem_Behavior -> Any
     read file on_problems =
         file.read_text this.encoding on_problems
+
+## Read delimited files such as CSVs into a Table.
+type Delimited
+    ## Read delimited files such as CSVs into a Table.
+
+       If a row does not match the first row's column count, the function raises
+       an `Invalid_Row`. If a quote is opened and never closed, a
+       `Mismatched_Quote` warning occurs.
+
+       Arguments:
+       - delimiter: The delimiter character to split the file into columns. An
+         `Illegal_Argument_Error` error is returned if this is an empty string.
+       - encoding: The encoding to use when reading the file.
+       - quote: The quote character denotes the start and end of a quoted value.
+         No quote character is used if set to `Nothing`. Quoted items are not
+         split on the delimiter and can also contain newlines. Within a quoted
+         value, two consecutive quote characters are interpreted as an instance
+         of the quote character. Empty input strings must be quoted (e.g. "") as
+         otherwise an empty value is treated as `Nothing`.
+       - quote_escape: The character to escape the quote character in a quoted
+         value. For example, if both `quote` and `quote_escape` are set to `"`,
+         then escaping quotes is done by double quotes: `"ab""cd"` will yield
+         the text `ab"cd`. Another popular choice for `quote_escape` is the `\`
+         character. Then `"ab\"cd"` will yield the same text.
+       - headers: If set to `True`, the first row is used as column names. If
+         set to `False`, the column names are generated by adding increasing
+         numeric suffixes to the base name `Column` (i.e. `Column_1`,
+         `Column_2` etc.). If set to `Infer`, the process tries to infer if
+         headers are present on the first row (`Infer` is not implemented yet).
+         If the column names are not unique, numeric suffixes will be appended
+         to disambiguate them.
+       - parse_values: The output columns are parsed using the default `Parser`
+         if `True`. If more control over parsing is needed, the
+         `Table.parse_values` method allows full specifications of the parser
+         options.
+       - skip_rows: The number of rows to skip from the top of the file.
+       - row_limit: The maximum number of rows to read from the file. This count
+         does not include the header row (if applicable).
+       - keep_invalid_rows: Specifies whether rows that contain fewer or more
+         columns than expected should be kept (setting the missing columns to
+         `Nothing` or dropping the excess columns) or dropped.
+
+       TODO [RW] The default for `headers` is temporarily changed to `False`,
+       because `Infer` is not supported. `Infer` should become the default
+       value once the corresponding task is implemented:
+       https://www.pivotaltracker.com/story/show/181986831
+
+       TODO [RW] The default for `parse_values` is temporarily changed to
+       `False`, because this feature is not yet implemented. It should be
+       changed to `True` once the related task is implemented:
+       https://www.pivotaltracker.com/story/show/181824146
+    type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (quote:Text|Nothing='"') (quote_escape:Text|Nothing='"') (headers:True|False|Infer=False) (parse_values:Boolean=False) (skip_rows:Integer|Nothing=Nothing) (row_limit:Integer|Nothing=Nothing) (keep_invalid_rows:Boolean=True)
+
+    ## Implements the `File.read` for this `File_Format`
+    read : File -> Problem_Behavior -> Any
+    read file on_problems =
+        Delimited_Reader.read_file this file on_problems
+
+## A setting to infer the default behaviour of some option.
+type Infer
+
@@ -165,7 +165,7 @@ fail message =
 Any.should_fail_with : Any -> Integer -> Assertion
 Any.should_fail_with matcher frames_to_skip=0 =
     loc = Meta.get_source_location 1+frames_to_skip
-    here.fail ("Expected an error " + matcher.to_text + " but none occurred (at " + loc + ").")
+    here.fail ("Expected an error " + matcher.to_text + " but no error occurred, instead got: " + this.to_text + " (at " + loc + ").")
 
 ## Expect a function to fail with the provided dataflow error.
 

diff --git a/engine/runtime/src/main/resources/Builtins.enso b/engine/runtime/src/main/resources/Builtins.enso
@@ -309,10 +309,11 @@ type Inexhaustive_Pattern_Match_Error scrutinee
    does not match the expected number of arguments.
 
    Arguments:
-     - expected: the expected number of arguments.
+     - expected_min: the minimum expected number of arguments.
+     - expected_max: the maximum expected number of arguments.
      - actual: the actual number of arguments passed.
 @Builtin_Type
-type Arity_Error expected actual
+type Arity_Error expected_min expected_max actual
 
 ## The error thrown when the program attempts to read from a state slot that has
    not yet been initialized.

@@ -0,0 +1,10 @@
+package org.enso.table.read;
+
+/** A problem which indicates how many additional invalid rows were encountered. */
+public class AdditionalInvalidRows implements ParsingProblem {
+  public final long count;
+
+  public AdditionalInvalidRows(long count) {
+    this.count = count;
+  }
+}