Commit
Delimited File Encoding (#3430)
radeusgd authored May 10, 2022
1 parent 0e904b2 commit 64f178f
Showing 20 changed files with 768 additions and 194 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -116,6 +116,7 @@
- [Aligned `Text.split` API with other methods and added `Text.lines`.][3415]
- [Implemented a basic reader for the `Delimited` file format.][3424]
- [Implemented a reader for the `Excel` file format.][3425]
- [Added custom encoding support to the `Delimited` file format reader.][3430]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -179,6 +180,7 @@
[3415]: https://github.com/enso-org/enso/pull/3415
[3424]: https://github.com/enso-org/enso/pull/3424
[3425]: https://github.com/enso-org/enso/pull/3425
[3430]: https://github.com/enso-org/enso/pull/3430

#### Enso Compiler

@@ -24,6 +24,7 @@ polyglot java import com.ibm.icu.lang.UCharacter
polyglot java import com.ibm.icu.text.BreakIterator
polyglot java import java.lang.StringBuilder
polyglot java import org.enso.base.Text_Utils
polyglot java import org.enso.base.Encoding_Utils

## UNSTABLE

@@ -751,7 +752,7 @@ Text.is_whitespace =
"Hello".bytes (Encoding.ascii)
Text.bytes : Encoding -> Problem_Behavior -> Vector.Vector Byte
Text.bytes encoding on_problems=Report_Warning =
result = Text_Utils.get_bytes this (encoding . to_java_charset)
result = Encoding_Utils.get_bytes this (encoding . to_java_charset)
vector = Vector.Vector result.result
if result.warnings.is_nothing then vector else
on_problems.attach_problems_after vector [Encoding_Error result.warnings]
@@ -774,7 +775,7 @@ Text.bytes encoding on_problems=Report_Warning =
"Hello".bytes (Encoding.ascii)
Text.from_bytes : Vector.Vector Byte -> Encoding -> Text
Text.from_bytes bytes encoding on_problems=Report_Warning =
result = Text_Utils.from_bytes bytes.to_array (encoding . to_java_charset)
result = Encoding_Utils.from_bytes bytes.to_array (encoding . to_java_charset)
if result.warnings.is_nothing then result.result else
on_problems.attach_problems_after result.result [Encoding_Error result.warnings]
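Both `Text.bytes` and `Text.from_bytes` now delegate to the new `Encoding_Utils` helpers (added later in this commit), which configure the JDK charset coders with `CodingErrorAction.REPORT` so that malformed or unmappable input is surfaced to Enso as `Encoding_Error` warnings instead of being substituted silently. A minimal JDK-only sketch of that configuration, separate from Enso's wrapper (the wrapper handles reported problems itself, inserting replacements and recording positions, whereas the convenience `encode` call below simply throws):

```java
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

public class ReportingEncodeDemo {
    public static void main(String[] args) {
        // Same coder configuration as Encoding_Utils.get_bytes: report problems, don't replace.
        CharsetEncoder encoder = StandardCharsets.US_ASCII
            .newEncoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
        try {
            encoder.encode(CharBuffer.wrap("Hello"));          // pure ASCII: succeeds
            encoder.reset().encode(CharBuffer.wrap("Héllo"));  // 'é' is unmappable in ASCII: reported
        } catch (CharacterCodingException e) {
            System.out.println("encoding problem reported: " + e);
        }
    }
}
```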

@@ -4,7 +4,7 @@ import Standard.Table
import Standard.Base.Error.Common as Errors
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior
from Standard.Table.Error as Table_Errors import Invalid_Row, Mismatched_Quote, Parser_Error, Additional_Invalid_Rows
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
from Standard.Table.Io.File_Format import Infer

polyglot java import org.enso.table.read.DelimitedReader
@@ -15,6 +15,7 @@ polyglot java import org.enso.table.read.AdditionalInvalidRows
polyglot java import java.lang.IllegalArgumentException
polyglot java import java.io.IOException
polyglot java import com.univocity.parsers.common.TextParsingException
polyglot java import org.enso.base.Encoding_Utils
polyglot java import java.io.InputStream

## Reads a delimited file according to the provided format.
@@ -28,14 +29,13 @@ polyglot java import java.io.InputStream
If set to `Ignore`, the operation proceeds without errors or warnings.
read_file : Delimited -> File -> Problem_Behavior -> Any
read_file format file on_problems =
if format.encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else
## We use the default `max_columns` setting. If we want to be able to
read files with unlimited column limits (risking OutOfMemory
exceptions), we can catch the exception indicating the limit has been
reached and restart parsing with an increased limit.
file.with_input_stream [File.Option.Read] stream->
stream.with_java_stream java_stream->
here.read_stream format java_stream on_problems related_file=file
## We use the default `max_columns` setting. If we want to be able to
read files with unlimited column limits (risking OutOfMemory
exceptions), we can catch the exception indicating the limit has been
reached and restart parsing with an increased limit.
file.with_input_stream [File.Option.Read] stream->
stream.with_java_stream java_stream->
here.read_stream format java_stream on_problems related_file=file

## PRIVATE
Reads an input stream according to the provided format.
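The comment inside `read_file` above sketches a possible follow-up: catch the parser's column-limit signal and restart with a larger `max_columns` instead of failing. That retry is not implemented in this commit; the following is only a hypothetical, self-contained illustration of the idea (the names are placeholders, and `IllegalStateException` stands in for whatever signal the real parser would raise):

```java
import java.util.function.IntFunction;

public class GrowingLimitRetry {
    /** Runs a column-limited parse step, doubling the limit whenever it is reported as exceeded. */
    public static <T> T parseWithGrowingLimit(IntFunction<T> parseWithLimit, int initialLimit, int maxLimit) {
        for (int limit = initialLimit; limit <= maxLimit; limit *= 2) {
            try {
                return parseWithLimit.apply(limit);
            } catch (IllegalStateException limitExceeded) {
                // Retry with a doubled limit; a real implementation would also need to
                // re-open or rewind the input stream before parsing again.
            }
        }
        throw new IllegalStateException("Column limit still exceeded at " + maxLimit);
    }
}
```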
@@ -70,25 +70,33 @@ read_stream format java_stream on_problems max_columns=4096 related_file=Nothing
Integer -> format.row_limit
_ -> Error.throw (Illegal_Argument_Error "`row_limit` should be Integer or Nothing.")
if format.parse_values then Errors.unimplemented "Parsing values is not implemented yet." else
translate_illegal_argument caught_panic =
Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage)
translate_problem java_problem =
translate_parsing_problem java_problem =
if Java.is_instance java_problem InvalidRow then Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row) else
if Java.is_instance java_problem MismatchedQuote then Mismatched_Quote else
if Java.is_instance java_problem AdditionalInvalidRows then Additional_Invalid_Rows java_problem.count else
java_problem

translate_illegal_argument caught_panic =
Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage)
handle_illegal_arguments = Panic.catch IllegalArgumentException handler=translate_illegal_argument

translate_parsing_failure caught_panic =
Error.throw (translate_problem caught_panic.payload.cause.problem)
Error.throw (translate_parsing_problem caught_panic.payload.cause.problem)
handle_parsing_failure = Panic.catch ParsingFailedException handler=translate_parsing_failure

translate_parsing_exception caught_panic =
cause = caught_panic.payload.cause.getCause
if Java.is_instance cause IOException then File.wrap_io_exception related_file cause else
Error.throw (Parser_Error caught_panic.payload)
handle_parsing_exception = Panic.catch TextParsingException handler=translate_parsing_exception

Panic.catch IllegalArgumentException handler=translate_illegal_argument <|
Panic.catch ParsingFailedException handler=translate_parsing_failure <|
Panic.catch TextParsingException handler=translate_parsing_exception <|
warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error
reader = DelimitedReader.new java_stream format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns format.keep_invalid_rows warnings_as_errors
result = Table.Table reader.read
problems = Vector.Vector reader.getReportedProblems . map translate_problem
on_problems.attach_problems_after result problems
java_charset = format.encoding.to_java_charset
handle_illegal_arguments <| handle_parsing_failure <| handle_parsing_exception <|
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error
reader = DelimitedReader.new reporting_stream_decoder format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns format.keep_invalid_rows warnings_as_errors
result = Table.Table reader.read
decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error
parsing_problems = Vector.Vector reader.getReportedProblems . map translate_parsing_problem
problems = decoding_problems + parsing_problems
on_problems.attach_problems_after result problems
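The rewritten `read_stream` wraps the raw `InputStream` in a `ReportingStreamDecoder` via `Encoding_Utils.with_stream_decoder`, feeds the decoder to `DelimitedReader`, and then merges decoding problems with parsing problems before attaching them according to `on_problems`. A small Java sketch of calling that helper directly; it assumes `ReportingStreamDecoder` can be read like a `java.io.Reader` (that class is part of this commit but its diff is not shown in this excerpt):

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import org.enso.base.Encoding_Utils;

public class StreamDecoderDemo {
    public static void main(String[] args) throws IOException {
        byte[] payload = "a,b\n1,2\n".getBytes(StandardCharsets.UTF_8);
        try (InputStream stream = new ByteArrayInputStream(payload)) {
            // with_stream_decoder creates the decoder, runs the action, and closes it afterwards.
            String decoded = Encoding_Utils.with_stream_decoder(stream, StandardCharsets.UTF_8, decoder -> {
                StringBuilder sb = new StringBuilder();
                char[] buffer = new char[64];
                try {
                    int read;
                    // Assumption: ReportingStreamDecoder exposes a Reader-style read(char[]) method.
                    while ((read = decoder.read(buffer)) != -1) sb.append(buffer, 0, read);
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
                return sb.toString();
            });
            System.out.println(decoded);
        }
    }
}
```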
193 changes: 193 additions & 0 deletions std-bits/base/src/main/java/org/enso/base/Encoding_Utils.java
@@ -0,0 +1,193 @@
package org.enso.base;

import java.io.IOException;
import java.io.InputStream;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.function.IntFunction;
import org.enso.base.encoding.ReportingStreamDecoder;
import org.enso.base.text.ResultWithWarnings;

public class Encoding_Utils {
/** The replacement character used for characters that could not have been decoded. */
public static final String INVALID_CHARACTER = "\uFFFD";

/**
* Converts a string into an array of bytes using the specified encoding.
*
* @param str the string to convert
* @param charset the character set to use to encode the string
 * @return the byte representation of the string in the given charset
*/
public static ResultWithWarnings<byte[]> get_bytes(String str, Charset charset) {
if (str.isEmpty()) {
return new ResultWithWarnings<>(new byte[0]);
}

CharsetEncoder encoder =
charset
.newEncoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT)
.reset();

CharBuffer in = CharBuffer.wrap(str.toCharArray());
ByteBuffer out = ByteBuffer.allocate((int) (in.remaining() * encoder.averageBytesPerChar()));

StringBuilder warnings = null;
while (in.hasRemaining()) {
CoderResult cr = encoder.encode(in, out, true);
if (cr.isMalformed() || cr.isUnmappable()) {
// Get current position for error reporting
int position = in.position();

if (out.remaining() < encoder.replacement().length) {
out = resize(out, ByteBuffer::allocate, ByteBuffer::put);
}
out.put(encoder.replacement());
in.position(in.position() + cr.length());

if (warnings == null) {
warnings = new StringBuilder();
warnings.append("Encoding issues at ");
} else {
warnings.append(", ");
}
warnings.append(position);
} else if (cr.isUnderflow()) {
// Finished
while (encoder.flush(out) == CoderResult.OVERFLOW) {
out = resize(out, ByteBuffer::allocate, ByteBuffer::put);
}
break;
} else if (cr.isOverflow()) {
out = resize(out, ByteBuffer::allocate, ByteBuffer::put);
}
}

out.flip();
byte[] array = out.array();
if (out.limit() != array.length) {
array = Arrays.copyOf(array, out.limit());
}

if (warnings == null) {
return new ResultWithWarnings<>(array);
}

warnings.append(".");
return new ResultWithWarnings<>(array, warnings.toString());
}

/**
* Converts an array of encoded bytes into a string.
*
* @param bytes the bytes to convert
* @param charset the character set to use to decode the bytes
* @return the resulting string
*/
public static ResultWithWarnings<String> from_bytes(byte[] bytes, Charset charset) {
if (bytes.length == 0) {
return new ResultWithWarnings<>("");
}

CharsetDecoder decoder =
charset
.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT)
.reset();

ByteBuffer in = ByteBuffer.wrap(bytes);
CharBuffer out = CharBuffer.allocate((int) (bytes.length * decoder.averageCharsPerByte()));

StringBuilder warnings = null;
while (in.hasRemaining()) {
CoderResult cr = decoder.decode(in, out, true);
if (cr.isMalformed() || cr.isUnmappable()) {
// Get current position for error reporting
int position = in.position();

if (out.remaining() < INVALID_CHARACTER.length()) {
out = resize(out, CharBuffer::allocate, CharBuffer::put);
}
out.put(INVALID_CHARACTER);
in.position(in.position() + cr.length());

if (warnings == null) {
warnings = new StringBuilder();
warnings.append("Encoding issues at ");
} else {
warnings.append(", ");
}
warnings.append(position);
} else if (cr.isUnderflow()) {
// Finished
while (decoder.flush(out) == CoderResult.OVERFLOW) {
out = resize(out, CharBuffer::allocate, CharBuffer::put);
}
break;
} else if (cr.isOverflow()) {
out = resize(out, CharBuffer::allocate, CharBuffer::put);
}
}

out.flip();

if (warnings == null) {
return new ResultWithWarnings<>(out.toString());
}

warnings.append(".");
return new ResultWithWarnings<>(out.toString(), warnings.toString());
}

/** Creates a new instance of {@code ReportingStreamDecoder} decoding a given charset. */
private static ReportingStreamDecoder create_stream_decoder(InputStream stream, Charset charset) {
CharsetDecoder decoder =
charset
.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT)
.reset();
return new ReportingStreamDecoder(stream, decoder);
}

/**
* A helper function which runs an action with a created stream decoder and closes it afterwards.
*/
public static <R> R with_stream_decoder(
InputStream stream, Charset charset, Function<ReportingStreamDecoder, R> action)
throws IOException {
try (ReportingStreamDecoder decoder = create_stream_decoder(stream, charset)) {
return action.apply(decoder);
}
}

/**
* A generic function to resize a buffer.
*
* @param <T> the type of the buffer to allocate
* @param old the buffer to resize
* @param allocate a function allocating a buffer of required type of a given size
* @param put a function which can transfer data from the old buffer into the new one
* @return the new buffer with increased capacity
*/
public static <T extends Buffer> T resize(T old, IntFunction<T> allocate, BiConsumer<T, T> put) {
int n = old.capacity();
int new_n = (3 * n) / 2 + 1;
T o = allocate.apply(new_n);
old.flip();
put.accept(o, old);
return o;
}
}
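Both `get_bytes` and `from_bytes` grow their output buffers with the generic `resize` helper above, supplying the buffer type's allocator and bulk-put as the two function arguments. A small usage sketch of that helper on its own:

```java
import java.nio.ByteBuffer;
import org.enso.base.Encoding_Utils;

public class ResizeDemo {
    public static void main(String[] args) {
        ByteBuffer full = ByteBuffer.allocate(4);
        full.put(new byte[] {1, 2, 3, 4}); // buffer is now full
        // Grows capacity to 3*4/2 + 1 = 7 and copies the existing 4 bytes over.
        ByteBuffer bigger = Encoding_Utils.resize(full, ByteBuffer::allocate, ByteBuffer::put);
        System.out.println(bigger.capacity()); // 7
        System.out.println(bigger.position()); // 4 -- ready for further writes
    }
}
```

The `3n/2 + 1` growth factor mirrors the amortized growth used by common collection implementations, so repeated resizes during encoding or decoding stay cheap overall.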