Commit
Delimited File Encoding (#3430)
radeusgd authored May 10, 2022
1 parent 0e904b2 commit 64f178f
Showing 20 changed files with 768 additions and 194 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -116,6 +116,7 @@
- [Aligned `Text.split` API with other methods and added `Text.lines`.][3415]
- [Implemented a basic reader for the `Delimited` file format.][3424]
- [Implemented a reader for the `Excel` file format.][3425]
- [Added custom encoding support to the `Delimited` file format reader.][3430]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
@@ -179,6 +180,7 @@
[3415]: https://github.com/enso-org/enso/pull/3415
[3424]: https://github.com/enso-org/enso/pull/3424
[3425]: https://github.com/enso-org/enso/pull/3425
[3430]: https://github.com/enso-org/enso/pull/3430

#### Enso Compiler

@@ -24,6 +24,7 @@ polyglot java import com.ibm.icu.lang.UCharacter
polyglot java import com.ibm.icu.text.BreakIterator
polyglot java import java.lang.StringBuilder
polyglot java import org.enso.base.Text_Utils
polyglot java import org.enso.base.Encoding_Utils

## UNSTABLE

@@ -751,7 +752,7 @@ Text.is_whitespace =
"Hello".bytes (Encoding.ascii)
Text.bytes : Encoding -> Problem_Behavior -> Vector.Vector Byte
Text.bytes encoding on_problems=Report_Warning =
result = Text_Utils.get_bytes this (encoding . to_java_charset)
result = Encoding_Utils.get_bytes this (encoding . to_java_charset)
vector = Vector.Vector result.result
if result.warnings.is_nothing then vector else
on_problems.attach_problems_after vector [Encoding_Error result.warnings]
@@ -774,7 +775,7 @@ Text.bytes encoding on_problems=Report_Warning =
"Hello".bytes (Encoding.ascii)
Text.from_bytes : Vector.Vector Byte -> Encoding -> Text
Text.from_bytes bytes encoding on_problems=Report_Warning =
result = Text_Utils.from_bytes bytes.to_array (encoding . to_java_charset)
result = Encoding_Utils.from_bytes bytes.to_array (encoding . to_java_charset)
if result.warnings.is_nothing then result.result else
on_problems.attach_problems_after result.result [Encoding_Error result.warnings]
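Both `Text.bytes` and `Text.from_bytes` now delegate to the new `Encoding_Utils` helpers (added later in this commit), which configure the JDK charset coders with `CodingErrorAction.REPORT` so that malformed or unmappable input is surfaced to Enso as `Encoding_Error` warnings instead of being substituted silently. A minimal JDK-only sketch of that configuration, separate from Enso's wrapper (the wrapper handles reported problems itself, inserting replacements and recording positions, whereas the convenience `encode` call below simply throws):

```java
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;

public class ReportingEncodeDemo {
    public static void main(String[] args) {
        // Same coder configuration as Encoding_Utils.get_bytes: report problems, don't replace.
        CharsetEncoder encoder = StandardCharsets.US_ASCII
            .newEncoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
        try {
            encoder.encode(CharBuffer.wrap("Hello"));          // pure ASCII: succeeds
            encoder.reset().encode(CharBuffer.wrap("Héllo"));  // 'é' is unmappable in ASCII: reported
        } catch (CharacterCodingException e) {
            System.out.println("encoding problem reported: " + e);
        }
    }
}
```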

@@ -4,7 +4,7 @@ import Standard.Table
import Standard.Base.Error.Common as Errors
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior
from Standard.Table.Error as Table_Errors import Invalid_Row, Mismatched_Quote, Parser_Error, Additional_Invalid_Rows
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
from Standard.Table.Io.File_Format import Infer

polyglot java import org.enso.table.read.DelimitedReader
@@ -15,6 +15,7 @@ polyglot java import org.enso.table.read.AdditionalInvalidRows
polyglot java import java.lang.IllegalArgumentException
polyglot java import java.io.IOException
polyglot java import com.univocity.parsers.common.TextParsingException
polyglot java import org.enso.base.Encoding_Utils
polyglot java import java.io.InputStream

## Reads a delimited file according to the provided format.
@@ -28,14 +29,13 @@ polyglot java import java.io.InputStream
If set to `Ignore`, the operation proceeds without errors or warnings.
read_file : Delimited -> File -> Problem_Behavior -> Any
read_file format file on_problems =
if format.encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else
## We use the default `max_columns` setting. If we want to be able to
read files with unlimited column limits (risking OutOfMemory
exceptions), we can catch the exception indicating the limit has been
reached and restart parsing with an increased limit.
file.with_input_stream [File.Option.Read] stream->
stream.with_java_stream java_stream->
here.read_stream format java_stream on_problems related_file=file
## We use the default `max_columns` setting. If we want to be able to
read files with unlimited column limits (risking OutOfMemory
exceptions), we can catch the exception indicating the limit has been
reached and restart parsing with an increased limit.
file.with_input_stream [File.Option.Read] stream->
stream.with_java_stream java_stream->
here.read_stream format java_stream on_problems related_file=file

## PRIVATE
Reads an input stream according to the provided format.
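The comment inside `read_file` above sketches a possible follow-up: catch the parser's column-limit signal and restart with a larger `max_columns` instead of failing. That retry is not implemented in this commit; the following is only a hypothetical, self-contained illustration of the idea (the names are placeholders, and `IllegalStateException` stands in for whatever signal the real parser would raise):

```java
import java.util.function.IntFunction;

public class GrowingLimitRetry {
    /** Runs a column-limited parse step, doubling the limit whenever it is reported as exceeded. */
    public static <T> T parseWithGrowingLimit(IntFunction<T> parseWithLimit, int initialLimit, int maxLimit) {
        for (int limit = initialLimit; limit <= maxLimit; limit *= 2) {
            try {
                return parseWithLimit.apply(limit);
            } catch (IllegalStateException limitExceeded) {
                // Retry with a doubled limit; a real implementation would also need to
                // re-open or rewind the input stream before parsing again.
            }
        }
        throw new IllegalStateException("Column limit still exceeded at " + maxLimit);
    }
}
```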
@@ -70,25 +70,33 @@ read_stream format java_stream on_problems max_columns=4096 related_file=Nothing
Integer -> format.row_limit
_ -> Error.throw (Illegal_Argument_Error "`row_limit` should be Integer or Nothing.")
if format.parse_values then Errors.unimplemented "Parsing values is not implemented yet." else
translate_illegal_argument caught_panic =
Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage)
translate_problem java_problem =
translate_parsing_problem java_problem =
if Java.is_instance java_problem InvalidRow then Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row) else
if Java.is_instance java_problem MismatchedQuote then Mismatched_Quote else
if Java.is_instance java_problem AdditionalInvalidRows then Additional_Invalid_Rows java_problem.count else
java_problem

translate_illegal_argument caught_panic =
Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage)
handle_illegal_arguments = Panic.catch IllegalArgumentException handler=translate_illegal_argument

translate_parsing_failure caught_panic =
Error.throw (translate_problem caught_panic.payload.cause.problem)
Error.throw (translate_parsing_problem caught_panic.payload.cause.problem)
handle_parsing_failure = Panic.catch ParsingFailedException handler=translate_parsing_failure

translate_parsing_exception caught_panic =
cause = caught_panic.payload.cause.getCause
if Java.is_instance cause IOException then File.wrap_io_exception related_file cause else
Error.throw (Parser_Error caught_panic.payload)
handle_parsing_exception = Panic.catch TextParsingException handler=translate_parsing_exception

Panic.catch IllegalArgumentException handler=translate_illegal_argument <|
Panic.catch ParsingFailedException handler=translate_parsing_failure <|
Panic.catch TextParsingException handler=translate_parsing_exception <|
warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error
reader = DelimitedReader.new java_stream format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns format.keep_invalid_rows warnings_as_errors
result = Table.Table reader.read
problems = Vector.Vector reader.getReportedProblems . map translate_problem
on_problems.attach_problems_after result problems
java_charset = format.encoding.to_java_charset
handle_illegal_arguments <| handle_parsing_failure <| handle_parsing_exception <|
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error
reader = DelimitedReader.new reporting_stream_decoder format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns format.keep_invalid_rows warnings_as_errors
result = Table.Table reader.read
decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error
parsing_problems = Vector.Vector reader.getReportedProblems . map translate_parsing_problem
problems = decoding_problems + parsing_problems
on_problems.attach_problems_after result problems
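The rewritten `read_stream` wraps the raw `InputStream` in a `ReportingStreamDecoder` via `Encoding_Utils.with_stream_decoder`, feeds the decoder to `DelimitedReader`, and then merges decoding problems with parsing problems before attaching them according to `on_problems`. A small Java sketch of calling that helper directly; it assumes `ReportingStreamDecoder` can be read like a `java.io.Reader` (that class is part of this commit but its diff is not shown in this excerpt):

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import org.enso.base.Encoding_Utils;

public class StreamDecoderDemo {
    public static void main(String[] args) throws IOException {
        byte[] payload = "a,b\n1,2\n".getBytes(StandardCharsets.UTF_8);
        try (InputStream stream = new ByteArrayInputStream(payload)) {
            // with_stream_decoder creates the decoder, runs the action, and closes it afterwards.
            String decoded = Encoding_Utils.with_stream_decoder(stream, StandardCharsets.UTF_8, decoder -> {
                StringBuilder sb = new StringBuilder();
                char[] buffer = new char[64];
                try {
                    int read;
                    // Assumption: ReportingStreamDecoder exposes a Reader-style read(char[]) method.
                    while ((read = decoder.read(buffer)) != -1) sb.append(buffer, 0, read);
                } catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
                return sb.toString();
            });
            System.out.println(decoded);
        }
    }
}
```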
193 changes: 193 additions & 0 deletions std-bits/base/src/main/java/org/enso/base/Encoding_Utils.java
@@ -0,0 +1,193 @@
package org.enso.base;

import java.io.IOException;
import java.io.InputStream;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.function.IntFunction;
import org.enso.base.encoding.ReportingStreamDecoder;
import org.enso.base.text.ResultWithWarnings;

public class Encoding_Utils {
/** The replacement character used for characters that could not have been decoded. */
public static final String INVALID_CHARACTER = "\uFFFD";

/**
* Converts a string into an array of bytes using the specified encoding.
*
* @param str the string to convert
* @param charset the character set to use to encode the string
 * @return the byte representation of the string in the given charset
*/
public static ResultWithWarnings<byte[]> get_bytes(String str, Charset charset) {
if (str.isEmpty()) {
return new ResultWithWarnings<>(new byte[0]);
}

CharsetEncoder encoder =
charset
.newEncoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT)
.reset();

CharBuffer in = CharBuffer.wrap(str.toCharArray());
ByteBuffer out = ByteBuffer.allocate((int) (in.remaining() * encoder.averageBytesPerChar()));

StringBuilder warnings = null;
while (in.hasRemaining()) {
CoderResult cr = encoder.encode(in, out, true);
if (cr.isMalformed() || cr.isUnmappable()) {
// Get current position for error reporting
int position = in.position();

if (out.remaining() < encoder.replacement().length) {
out = resize(out, ByteBuffer::allocate, ByteBuffer::put);
}
out.put(encoder.replacement());
in.position(in.position() + cr.length());

if (warnings == null) {
warnings = new StringBuilder();
warnings.append("Encoding issues at ");
} else {
warnings.append(", ");
}
warnings.append(position);
} else if (cr.isUnderflow()) {
// Finished
while (encoder.flush(out) == CoderResult.OVERFLOW) {
out = resize(out, ByteBuffer::allocate, ByteBuffer::put);
}
break;
} else if (cr.isOverflow()) {
out = resize(out, ByteBuffer::allocate, ByteBuffer::put);
}
}

out.flip();
byte[] array = out.array();
if (out.limit() != array.length) {
array = Arrays.copyOf(array, out.limit());
}

if (warnings == null) {
return new ResultWithWarnings<>(array);
}

warnings.append(".");
return new ResultWithWarnings<>(array, warnings.toString());
}

/**
* Converts an array of encoded bytes into a string.
*
* @param bytes the bytes to convert
* @param charset the character set to use to decode the bytes
* @return the resulting string
*/
public static ResultWithWarnings<String> from_bytes(byte[] bytes, Charset charset) {
if (bytes.length == 0) {
return new ResultWithWarnings<>("");
}

CharsetDecoder decoder =
charset
.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT)
.reset();

ByteBuffer in = ByteBuffer.wrap(bytes);
CharBuffer out = CharBuffer.allocate((int) (bytes.length * decoder.averageCharsPerByte()));

StringBuilder warnings = null;
while (in.hasRemaining()) {
CoderResult cr = decoder.decode(in, out, true);
if (cr.isMalformed() || cr.isUnmappable()) {
// Get current position for error reporting
int position = in.position();

if (out.remaining() < INVALID_CHARACTER.length()) {
out = resize(out, CharBuffer::allocate, CharBuffer::put);
}
out.put(INVALID_CHARACTER);
in.position(in.position() + cr.length());

if (warnings == null) {
warnings = new StringBuilder();
warnings.append("Encoding issues at ");
} else {
warnings.append(", ");
}
warnings.append(position);
} else if (cr.isUnderflow()) {
// Finished
while (decoder.flush(out) == CoderResult.OVERFLOW) {
out = resize(out, CharBuffer::allocate, CharBuffer::put);
}
break;
} else if (cr.isOverflow()) {
out = resize(out, CharBuffer::allocate, CharBuffer::put);
}
}

out.flip();

if (warnings == null) {
return new ResultWithWarnings<>(out.toString());
}

warnings.append(".");
return new ResultWithWarnings<>(out.toString(), warnings.toString());
}

/** Creates a new instance of {@code ReportingStreamDecoder} decoding a given charset. */
private static ReportingStreamDecoder create_stream_decoder(InputStream stream, Charset charset) {
CharsetDecoder decoder =
charset
.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT)
.reset();
return new ReportingStreamDecoder(stream, decoder);
}

/**
* A helper function which runs an action with a created stream decoder and closes it afterwards.
*/
public static <R> R with_stream_decoder(
InputStream stream, Charset charset, Function<ReportingStreamDecoder, R> action)
throws IOException {
try (ReportingStreamDecoder decoder = create_stream_decoder(stream, charset)) {
return action.apply(decoder);
}
}

/**
* A generic function to resize a buffer.
*
* @param <T> the type of the buffer to allocate
* @param old the buffer to resize
* @param allocate a function allocating a buffer of required type of a given size
* @param put a function which can transfer data from the old buffer into the new one
* @return the new buffer with increased capacity
*/
public static <T extends Buffer> T resize(T old, IntFunction<T> allocate, BiConsumer<T, T> put) {
int n = old.capacity();
int new_n = (3 * n) / 2 + 1;
T o = allocate.apply(new_n);
old.flip();
put.accept(o, old);
return o;
}
}
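Both `get_bytes` and `from_bytes` grow their output buffers with the generic `resize` helper above, supplying the buffer type's allocator and bulk-put as the two function arguments. A small usage sketch of that helper on its own:

```java
import java.nio.ByteBuffer;
import org.enso.base.Encoding_Utils;

public class ResizeDemo {
    public static void main(String[] args) {
        ByteBuffer full = ByteBuffer.allocate(4);
        full.put(new byte[] {1, 2, 3, 4}); // buffer is now full
        // Grows capacity to 3*4/2 + 1 = 7 and copies the existing 4 bytes over.
        ByteBuffer bigger = Encoding_Utils.resize(full, ByteBuffer::allocate, ByteBuffer::put);
        System.out.println(bigger.capacity()); // 7
        System.out.println(bigger.position()); // 4 -- ready for further writes
    }
}
```

The `3n/2 + 1` growth factor mirrors the amortized growth used by common collection implementations, so repeated resizes during encoding or decoding stay cheap overall.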