diff --git a/CHANGELOG.md b/CHANGELOG.md index 560d252014d8..49572e322ac4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -116,6 +116,7 @@ - [Aligned `Text.split` API with other methods and added `Text.lines`.][3415] - [Implemented a basic reader for the `Delimited` file format.][3424] - [Implemented a reader for the `Excel` file format.][3425] +- [Added custom encoding support to the `Delimited` file format reader.][3430] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -179,6 +180,7 @@ [3415]: https://github.com/enso-org/enso/pull/3415 [3424]: https://github.com/enso-org/enso/pull/3424 [3425]: https://github.com/enso-org/enso/pull/3425 +[3430]: https://github.com/enso-org/enso/pull/3430 #### Enso Compiler diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso index cfc66e4ad11b..bd2c07776b42 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Text/Extensions.enso @@ -24,6 +24,7 @@ polyglot java import com.ibm.icu.lang.UCharacter polyglot java import com.ibm.icu.text.BreakIterator polyglot java import java.lang.StringBuilder polyglot java import org.enso.base.Text_Utils +polyglot java import org.enso.base.Encoding_Utils ## UNSTABLE @@ -751,7 +752,7 @@ Text.is_whitespace = "Hello".bytes (Encoding.ascii) Text.bytes : Encoding -> Problem_Behavior -> Vector.Vector Byte Text.bytes encoding on_problems=Report_Warning = - result = Text_Utils.get_bytes this (encoding . to_java_charset) + result = Encoding_Utils.get_bytes this (encoding . to_java_charset) vector = Vector.Vector result.result if result.warnings.is_nothing then vector else on_problems.attach_problems_after vector [Encoding_Error result.warnings] @@ -774,7 +775,7 @@ Text.bytes encoding on_problems=Report_Warning = "Hello".bytes (Encoding.ascii) Text.from_bytes : Vector.Vector Byte -> Encoding -> Text Text.from_bytes bytes encoding on_problems=Report_Warning = - result = Text_Utils.from_bytes bytes.to_array (encoding . to_java_charset) + result = Encoding_Utils.from_bytes bytes.to_array (encoding . 
to_java_charset) if result.warnings.is_nothing then result.result else on_problems.attach_problems_after result.result [Encoding_Error result.warnings] diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso index 1fc718e8b183..c14486786956 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso @@ -4,7 +4,7 @@ import Standard.Table import Standard.Base.Error.Common as Errors from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior from Standard.Table.Error as Table_Errors import Invalid_Row, Mismatched_Quote, Parser_Error, Additional_Invalid_Rows -from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding +from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error from Standard.Table.Io.File_Format import Infer polyglot java import org.enso.table.read.DelimitedReader @@ -15,6 +15,7 @@ polyglot java import org.enso.table.read.AdditionalInvalidRows polyglot java import java.lang.IllegalArgumentException polyglot java import java.io.IOException polyglot java import com.univocity.parsers.common.TextParsingException +polyglot java import org.enso.base.Encoding_Utils polyglot java import java.io.InputStream ## Reads a delimited file according to the provided format. @@ -28,14 +29,13 @@ polyglot java import java.io.InputStream If set to `Ignore`, the operation proceeds without errors or warnings. read_file : Delimited -> File -> Problem_Behavior -> Any read_file format file on_problems = - if format.encoding != Encoding.utf_8 then Errors.unimplemented "Custom encodings when reading Delimited files are not implemented yet." else - ## We use the default `max_columns` setting. If we want to be able to - read files with unlimited column limits (risking OutOfMemory - exceptions), we can catch the exception indicating the limit has been - reached and restart parsing with an increased limit. - file.with_input_stream [File.Option.Read] stream-> - stream.with_java_stream java_stream-> - here.read_stream format java_stream on_problems related_file=file + ## We use the default `max_columns` setting. If we want to be able to + read files with unlimited column limits (risking OutOfMemory + exceptions), we can catch the exception indicating the limit has been + reached and restart parsing with an increased limit. + file.with_input_stream [File.Option.Read] stream-> + stream.with_java_stream java_stream-> + here.read_stream format java_stream on_problems related_file=file ## PRIVATE Reads an input stream according to the provided format. @@ -70,25 +70,33 @@ read_stream format java_stream on_problems max_columns=4096 related_file=Nothing Integer -> format.row_limit _ -> Error.throw (Illegal_Argument_Error "`row_limit` should be Integer or Nothing.") if format.parse_values then Errors.unimplemented "Parsing values is not implemented yet." 
else - translate_illegal_argument caught_panic = - Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) - translate_problem java_problem = + translate_parsing_problem java_problem = if Java.is_instance java_problem InvalidRow then Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row) else if Java.is_instance java_problem MismatchedQuote then Mismatched_Quote else if Java.is_instance java_problem AdditionalInvalidRows then Additional_Invalid_Rows java_problem.count else java_problem + + translate_illegal_argument caught_panic = + Error.throw (Illegal_Argument_Error caught_panic.payload.cause.getMessage) + handle_illegal_arguments = Panic.catch IllegalArgumentException handler=translate_illegal_argument + translate_parsing_failure caught_panic = - Error.throw (translate_problem caught_panic.payload.cause.problem) + Error.throw (translate_parsing_problem caught_panic.payload.cause.problem) + handle_parsing_failure = Panic.catch ParsingFailedException handler=translate_parsing_failure + translate_parsing_exception caught_panic = cause = caught_panic.payload.cause.getCause if Java.is_instance cause IOException then File.wrap_io_exception related_file cause else Error.throw (Parser_Error caught_panic.payload) + handle_parsing_exception = Panic.catch TextParsingException handler=translate_parsing_exception - Panic.catch IllegalArgumentException handler=translate_illegal_argument <| - Panic.catch ParsingFailedException handler=translate_parsing_failure <| - Panic.catch TextParsingException handler=translate_parsing_exception <| - warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error - reader = DelimitedReader.new java_stream format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns format.keep_invalid_rows warnings_as_errors - result = Table.Table reader.read - problems = Vector.Vector reader.getReportedProblems . map translate_problem - on_problems.attach_problems_after result problems + java_charset = format.encoding.to_java_charset + handle_illegal_arguments <| handle_parsing_failure <| handle_parsing_exception <| + Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder-> + warnings_as_errors = on_problems == Problem_Behavior_Module.Report_Error + reader = DelimitedReader.new reporting_stream_decoder format.delimiter format.quote format.quote_escape java_headers skip_rows row_limit max_columns format.keep_invalid_rows warnings_as_errors + result = Table.Table reader.read + decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error + parsing_problems = Vector.Vector reader.getReportedProblems . 
map translate_parsing_problem
+            problems = decoding_problems + parsing_problems
+            on_problems.attach_problems_after result problems
diff --git a/std-bits/base/src/main/java/org/enso/base/Encoding_Utils.java b/std-bits/base/src/main/java/org/enso/base/Encoding_Utils.java
new file mode 100644
index 000000000000..2f8e972536be
--- /dev/null
+++ b/std-bits/base/src/main/java/org/enso/base/Encoding_Utils.java
@@ -0,0 +1,193 @@
+package org.enso.base;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.Buffer;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.CodingErrorAction;
+import java.util.Arrays;
+import java.util.function.BiConsumer;
+import java.util.function.Function;
+import java.util.function.IntFunction;
+import org.enso.base.encoding.ReportingStreamDecoder;
+import org.enso.base.text.ResultWithWarnings;
+
+public class Encoding_Utils {
+  /** The replacement character used for characters that could not have been decoded. */
+  public static final String INVALID_CHARACTER = "\uFFFD";
+
+  /**
+   * Converts a string into an array of bytes using the specified encoding.
+   *
+   * @param str the string to convert
+   * @param charset the character set to use to encode the string
+   * @return the encoded representation of the string
+   */
+  public static ResultWithWarnings<byte[]> get_bytes(String str, Charset charset) {
+    if (str.isEmpty()) {
+      return new ResultWithWarnings<>(new byte[0]);
+    }
+
+    CharsetEncoder encoder =
+        charset
+            .newEncoder()
+            .onMalformedInput(CodingErrorAction.REPORT)
+            .onUnmappableCharacter(CodingErrorAction.REPORT)
+            .reset();
+
+    CharBuffer in = CharBuffer.wrap(str.toCharArray());
+    ByteBuffer out = ByteBuffer.allocate((int) (in.remaining() * encoder.averageBytesPerChar()));
+
+    StringBuilder warnings = null;
+    while (in.hasRemaining()) {
+      CoderResult cr = encoder.encode(in, out, true);
+      if (cr.isMalformed() || cr.isUnmappable()) {
+        // Get the current position for error reporting.
+        int position = in.position();
+
+        if (out.remaining() < encoder.replacement().length) {
+          out = resize(out, ByteBuffer::allocate, ByteBuffer::put);
+        }
+        out.put(encoder.replacement());
+        in.position(in.position() + cr.length());
+
+        if (warnings == null) {
+          warnings = new StringBuilder();
+          warnings.append("Encoding issues at ");
+        } else {
+          warnings.append(", ");
+        }
+        warnings.append(position);
+      } else if (cr.isUnderflow()) {
+        // Finished.
+        while (encoder.flush(out) == CoderResult.OVERFLOW) {
+          out = resize(out, ByteBuffer::allocate, ByteBuffer::put);
+        }
+        break;
+      } else if (cr.isOverflow()) {
+        out = resize(out, ByteBuffer::allocate, ByteBuffer::put);
+      }
+    }
+
+    out.flip();
+    byte[] array = out.array();
+    if (out.limit() != array.length) {
+      array = Arrays.copyOf(array, out.limit());
+    }
+
+    if (warnings == null) {
+      return new ResultWithWarnings<>(array);
+    }
+
+    warnings.append(".");
+    return new ResultWithWarnings<>(array, warnings.toString());
+  }
+
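+  /*
+   Illustrative example of the reporting behaviour (hypothetical input, not part of the API):
+   encoding "señor" with US-ASCII cannot map 'ñ', so the encoder's replacement byte ('?') is
+   emitted in its place and the character position is recorded:
+
+     ResultWithWarnings<byte[]> r = Encoding_Utils.get_bytes("señor", StandardCharsets.US_ASCII);
+     // r.result holds the bytes of "se?or"; r.warnings reads "Encoding issues at 2."
+  */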
+  /**
+   * Converts an array of encoded bytes into a string.
+   *
+   * @param bytes the bytes to convert
+   * @param charset the character set to use to decode the bytes
+   * @return the resulting string
+   */
+  public static ResultWithWarnings<String> from_bytes(byte[] bytes, Charset charset) {
+    if (bytes.length == 0) {
+      return new ResultWithWarnings<>("");
+    }
+
+    CharsetDecoder decoder =
+        charset
+            .newDecoder()
+            .onMalformedInput(CodingErrorAction.REPORT)
+            .onUnmappableCharacter(CodingErrorAction.REPORT)
+            .reset();
+
+    ByteBuffer in = ByteBuffer.wrap(bytes);
+    CharBuffer out = CharBuffer.allocate((int) (bytes.length * decoder.averageCharsPerByte()));
+
+    StringBuilder warnings = null;
+    while (in.hasRemaining()) {
+      CoderResult cr = decoder.decode(in, out, true);
+      if (cr.isMalformed() || cr.isUnmappable()) {
+        // Get the current position for error reporting.
+        int position = in.position();
+
+        if (out.remaining() < INVALID_CHARACTER.length()) {
+          out = resize(out, CharBuffer::allocate, CharBuffer::put);
+        }
+        out.put(INVALID_CHARACTER);
+        in.position(in.position() + cr.length());
+
+        if (warnings == null) {
+          warnings = new StringBuilder();
+          warnings.append("Encoding issues at ");
+        } else {
+          warnings.append(", ");
+        }
+        warnings.append(position);
+      } else if (cr.isUnderflow()) {
+        // Finished.
+        while (decoder.flush(out) == CoderResult.OVERFLOW) {
+          out = resize(out, CharBuffer::allocate, CharBuffer::put);
+        }
+        break;
+      } else if (cr.isOverflow()) {
+        out = resize(out, CharBuffer::allocate, CharBuffer::put);
+      }
+    }
+
+    out.flip();
+
+    if (warnings == null) {
+      return new ResultWithWarnings<>(out.toString());
+    }
+
+    warnings.append(".");
+    return new ResultWithWarnings<>(out.toString(), warnings.toString());
+  }
+
+  /** Creates a new instance of {@code ReportingStreamDecoder} decoding a given charset. */
+  private static ReportingStreamDecoder create_stream_decoder(InputStream stream, Charset charset) {
+    CharsetDecoder decoder =
+        charset
+            .newDecoder()
+            .onMalformedInput(CodingErrorAction.REPORT)
+            .onUnmappableCharacter(CodingErrorAction.REPORT)
+            .reset();
+    return new ReportingStreamDecoder(stream, decoder);
+  }
+
+  /**
+   * A helper function which runs an action with a created stream decoder and closes it afterwards.
+   */
+  public static <R> R with_stream_decoder(
+      InputStream stream, Charset charset, Function<ReportingStreamDecoder, R> action)
+      throws IOException {
+    try (ReportingStreamDecoder decoder = create_stream_decoder(stream, charset)) {
+      return action.apply(decoder);
+    }
+  }
+
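+  /*
+   Illustrative usage sketch (assumes a hypothetical local file "data.txt"): the helper manages
+   the decoder's lifecycle, so the action only needs to drain the Reader:
+
+     try (InputStream in = new FileInputStream("data.txt")) {
+       String text =
+           Encoding_Utils.with_stream_decoder(in, StandardCharsets.UTF_8, decoder -> {
+             try {
+               StringBuilder sb = new StringBuilder();
+               char[] buf = new char[1024];
+               int n;
+               while ((n = decoder.read(buf, 0, buf.length)) != -1) sb.append(buf, 0, n);
+               return sb.toString();
+             } catch (IOException e) {
+               throw new UncheckedIOException(e);
+             }
+           });
+     }
+  */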
+  /**
+   * A generic function to resize a buffer.
+   *
+   * @param <T> the type of the buffer to allocate
+   * @param old the buffer to resize
+   * @param allocate a function allocating a buffer of required type of a given size
+   * @param put a function which can transfer data from the old buffer into the new one
+   * @return the new buffer with increased capacity
+   */
+  public static <T extends Buffer> T resize(T old, IntFunction<T> allocate, BiConsumer<T, T> put) {
+    int n = old.capacity();
+    int new_n = (3 * n) / 2 + 1;
+    T o = allocate.apply(new_n);
+    old.flip();
+    put.accept(o, old);
+    return o;
+  }
+}
diff --git a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
index a153f8d75f28..e04530bac574 100644
--- a/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
+++ b/std-bits/base/src/main/java/org/enso/base/Text_Utils.java
@@ -6,30 +6,16 @@
 import com.ibm.icu.text.Normalizer;
 import com.ibm.icu.text.Normalizer2;
 import com.ibm.icu.text.StringSearch;
-import java.nio.Buffer;
-import java.nio.ByteBuffer;
-import java.nio.CharBuffer;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CharsetEncoder;
-import java.nio.charset.CoderResult;
-import java.nio.charset.CodingErrorAction;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 import java.util.Locale;
-import java.util.function.BiConsumer;
-import java.util.function.IntFunction;
-import java.util.regex.Pattern;
 import org.enso.base.text.CaseFoldedString;
 import org.enso.base.text.CaseFoldedString.Grapheme;
 import org.enso.base.text.GraphemeSpan;
-import org.enso.base.text.ResultWithWarnings;
 import org.enso.base.text.Utf16Span;
 
 /** Utils for standard library operations on Text. */
 public class Text_Utils {
-  private static final String INVALID_CHARACTER = "\uFFFD";
 
   /**
    * Creates a substring of the given string, indexing using the Java standard (UTF-16) indexing
@@ -55,80 +41,6 @@ public static String drop_first(String string, int from) {
     return string.substring(from);
   }
 
-  private static <T extends Buffer> T resize(T old, IntFunction<T> allocate, BiConsumer<T, T> put) {
-    int n = old.capacity();
-    int new_n = 2 * n + 1;
-    T o = allocate.apply(new_n);
-    old.flip();
-    put.accept(o, old);
-    return o;
-  }
-
-  /**
-   * Converts a string into an array of bytes using the specified encoding.
-   *
-   * @param str the string to convert
-   * @param charset the character set to use to encode the string
-   * @return the UTF-8 representation of the string.
- */ - public static ResultWithWarnings get_bytes(String str, Charset charset) { - if (str.isEmpty()) { - return new ResultWithWarnings<>(new byte[0]); - } - - CharsetEncoder encoder = - charset - .newEncoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT) - .reset(); - - CharBuffer in = CharBuffer.wrap(str.toCharArray()); - ByteBuffer out = ByteBuffer.allocate((int) (in.remaining() * encoder.averageBytesPerChar())); - - StringBuilder warnings = null; - while (in.hasRemaining()) { - CoderResult cr = encoder.encode(in, out, true); - if (cr.isMalformed() || cr.isUnmappable()) { - // Get current position for error reporting - int position = in.position(); - - if (out.remaining() < encoder.replacement().length) { - out = resize(out, ByteBuffer::allocate, ByteBuffer::put); - } - out.put(encoder.replacement()); - in.position(in.position() + cr.length()); - - if (warnings == null) { - warnings = new StringBuilder(); - warnings.append("Encoding issues at "); - } else { - warnings.append(", "); - } - warnings.append(position); - } else if (cr.isUnderflow()) { - // Finished - encoder.flush(out); - break; - } else if (cr.isOverflow()) { - out = resize(out, ByteBuffer::allocate, ByteBuffer::put); - } - } - - out.flip(); - byte[] array = out.array(); - if (out.limit() != array.length) { - array = Arrays.copyOf(array, out.limit()); - } - - if (warnings == null) { - return new ResultWithWarnings<>(array); - } - - warnings.append("."); - return new ResultWithWarnings<>(array, warnings.toString()); - } - /** * Converts a string into an array of UTF-16 chars. * @@ -230,67 +142,6 @@ public static String from_codepoints(int[] codepoints) { return new String(codepoints, 0, codepoints.length); } - /** - * Converts an array of encoded bytes into a string. - * - * @param bytes the bytes to convert - * @param charset the character set to use to decode the bytes - * @return the resulting string - */ - public static ResultWithWarnings from_bytes(byte[] bytes, Charset charset) { - if (bytes.length == 0) { - return new ResultWithWarnings<>(""); - } - - CharsetDecoder decoder = - charset - .newDecoder() - .onMalformedInput(CodingErrorAction.REPORT) - .onUnmappableCharacter(CodingErrorAction.REPORT) - .reset(); - - ByteBuffer in = ByteBuffer.wrap(bytes); - CharBuffer out = CharBuffer.allocate((int) (bytes.length * decoder.averageCharsPerByte())); - - StringBuilder warnings = null; - while (in.hasRemaining()) { - CoderResult cr = decoder.decode(in, out, true); - if (cr.isMalformed() || cr.isUnmappable()) { - // Get current position for error reporting - int position = in.position(); - - if (out.remaining() < INVALID_CHARACTER.length()) { - out = resize(out, CharBuffer::allocate, CharBuffer::put); - } - out.put(INVALID_CHARACTER); - in.position(in.position() + cr.length()); - - if (warnings == null) { - warnings = new StringBuilder(); - warnings.append("Encoding issues at "); - } else { - warnings.append(", "); - } - warnings.append(position); - } else if (cr.isUnderflow()) { - // Finished - decoder.flush(out); - break; - } else if (cr.isOverflow()) { - out = resize(out, CharBuffer::allocate, CharBuffer::put); - } - } - - out.flip(); - - if (warnings == null) { - return new ResultWithWarnings<>(out.toString()); - } - - warnings.append("."); - return new ResultWithWarnings<>(out.toString(), warnings.toString()); - } - /** * Converts an array of UTF-16 characters into a string. 
* diff --git a/std-bits/base/src/main/java/org/enso/base/encoding/ReportingStreamDecoder.java b/std-bits/base/src/main/java/org/enso/base/encoding/ReportingStreamDecoder.java new file mode 100644 index 000000000000..2917a818c121 --- /dev/null +++ b/std-bits/base/src/main/java/org/enso/base/encoding/ReportingStreamDecoder.java @@ -0,0 +1,316 @@ +package org.enso.base.encoding; + +import static org.enso.base.Encoding_Utils.INVALID_CHARACTER; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CoderResult; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; +import org.enso.base.Encoding_Utils; + +/** + * A {@code Reader} which takes an {@code InputStream} and decodes it using a provided {@code + * CharsetDecoder}. + * + *
+ * <p>Functionally, it should be equivalent to {@code java.io.InputStreamReader}. The major
+ * difference is that this class allows for more granular reporting of decoding issues: instead
+ * of just replacing malformed characters with a replacement or failing at the first error, it
+ * both performs the replacements and remembers the positions at which the problems occurred, so
+ * that it can later return a bulk report of all places where issues were encountered.
+ */
+public class ReportingStreamDecoder extends Reader {
+  public ReportingStreamDecoder(InputStream stream, CharsetDecoder decoder) {
+    bufferedInputStream = new BufferedInputStream(stream);
+    this.decoder = decoder;
+  }
+
+  private final BufferedInputStream bufferedInputStream;
+  private final CharsetDecoder decoder;
+
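+  /*
+   Illustrative usage sketch (hypothetical file name; in the standard library instances are
+   normally obtained via Encoding_Utils.with_stream_decoder, which also closes the reader). The
+   decoder must be configured with CodingErrorAction.REPORT for the reporting path to trigger:
+
+     CharsetDecoder decoder =
+         StandardCharsets.UTF_8
+             .newDecoder()
+             .onMalformedInput(CodingErrorAction.REPORT)
+             .onUnmappableCharacter(CodingErrorAction.REPORT);
+     try (var reader = new ReportingStreamDecoder(new FileInputStream("data.csv"), decoder)) {
+       char[] buf = new char[1024];
+       while (reader.read(buf, 0, buf.length) != -1) {
+         // consume the decoded characters
+       }
+       List<String> problems = reader.getReportedProblems(); // e.g. "Encoding issues at byte 22."
+     }
+  */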
+  /**
+   * The buffer keeping any characters that have already been decoded, but not consumed by the
+   * user yet.
+   *
+   * <p>Between the calls to read, it satisfies the invariant that it is in 'reading' mode.
+   */
+  private CharBuffer outputBuffer = null;
+
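+  /*
+   A short note on the 'modes' referred to above (standard java.nio behaviour): a freshly
+   allocated buffer is in writing mode, and flip() switches it to reading mode:
+
+     CharBuffer b = CharBuffer.allocate(8); // writing mode: position = 0, limit = 8
+     b.put('x');                            // position = 1
+     b.flip();                              // reading mode: position = 0, limit = 1
+  */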
+  /**
+   * The buffer keeping any input that has already been read but not decoded yet.
+   *
+   * <p>Between the calls to read, it satisfies the invariant that it is in 'reading' mode - to
+   * be able to write to it, it needs to be reallocated, compacted or flipped.
+   */
+  private ByteBuffer inputBuffer = null;
+
+  /**
+   * Indicates the number of bytes consumed before the start of the current input buffer.
+   *
+   * <p>The input buffer is reset many times, so its position only indicates the bytes consumed
+   * within the current iteration; this counter allows us to compute the overall number of bytes
+   * consumed so far.
+   */
+  private int inputBytesConsumedBeforeCurrentBuffer = 0;
+
+  /**
+   * We re-use the work array between calls to read, to avoid re-allocating it on each call. It
+   * is only re-allocated if it needs to be bigger than before.
+   */
+  private byte[] workArray = null;
+
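+  /*
+   Illustrative bookkeeping example (made-up numbers): if 100 bytes were consumed before the
+   current incarnation of the input buffer and its position is now 7, then
+   getCurrentInputPosition() yields 107; this is the 0-based byte offset used in the
+   "Encoding issues at byte ..." messages.
+  */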
+  /**
+   * A flag that is set once the end of input has been reached.
+   *
+   * <p>This informs us that no more input will be available from the input stream. There may
+   * still be some pending characters in the output buffer.
+   */
+  private boolean eof = false;
+
+  /**
+   * Specifies whether the final {@code decoder.decode} call (the one with {@code endOfInput =
+   * true}) has already been made.
+   */
+  private boolean hadEofDecodeCall = false;
+
+  /**
+   * A list of positions containing encoding issues like malformed characters.
+   *
+   * <p>Used for reporting warnings.
+   */
+  List<Integer> encodingIssuePositions = new ArrayList<>();
+
+  @Override
+  public int read(char[] cbuf, int off, int len) throws IOException {
+    int readBytes = 0;
+
+    // First feed the pending characters that were already decoded.
+    if (outputBuffer != null && outputBuffer.hasRemaining()) {
+      int toTransfer = Math.min(len, outputBuffer.remaining());
+      outputBuffer.get(cbuf, off, toTransfer);
+      off += toTransfer;
+      len -= toTransfer;
+      readBytes += toTransfer;
+    }
+
+    // If the request is satisfied, we do not continue.
+    if (len <= 0) {
+      return readBytes;
+    }
+
+    // If we reached end of file, we won't be able to read any more data from the input. We also
+    // ran out of cached characters, so we indicate that there is no more input.
+    if (eof) {
+      // If the previous invocation of read set the EOF flag, it must have finished the decoding
+      // process and flushed the decoder, so the input buffer must have been consumed in whole.
+      assert !inputBuffer.hasRemaining();
+      return -1;
+    }
+
+    // At this point we ran out of cached characters, so we will read some more input to try to
+    // get new characters for the request.
+
+    prepareOutputBuffer(len);
+
+    int expectedInputSize = Math.max((int) (len / decoder.averageCharsPerByte()), 10);
+    readInputStreamToInputBuffer(expectedInputSize);
+    runDecoderOnInputBuffer();
+
+    // We transfer as much as the user requested; anything remaining will be cached for the next
+    // invocation.
+    int toTransfer = Math.min(len, outputBuffer.remaining());
+    outputBuffer.get(cbuf, off, toTransfer);
+    readBytes += toTransfer;
+
+    // If we did not read any new characters in the call that reached EOF, we return -1
+    // immediately instead of postponing it to the next call. Returning 0 at the end of input was
+    // causing a division by zero in the CSV parser.
+    return (eof && readBytes <= 0) ? -1 : readBytes;
+  }
+
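+  /*
+   A worked example of the caching behaviour (made-up sizes): when a caller requests 4
+   characters while a single decoding pass produces 10, the call returns 4 and the remaining 6
+   stay in outputBuffer; the next read is then served entirely from that cache, without touching
+   the underlying stream:
+
+     char[] buf = new char[4];
+     reader.read(buf, 0, 4); // returns 4: "Hell"; the surplus characters stay cached
+     reader.read(buf, 0, 4); // returns 4: "o Wo"; no stream access needed
+  */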
+  /**
+   * Ensures that the output buffer is allocated and has enough space to fit as many characters
+   * as we expect to read.
+   *
+   * <p>When this method is called, the output buffer should not have any remaining cached
+   * characters and should be in read mode. After the method returns, the output buffer is empty
+   * and left in write mode.
+   */
+  private void prepareOutputBuffer(int expectedCharactersCount) {
+    assert outputBuffer == null || !outputBuffer.hasRemaining();
+    if (outputBuffer == null || outputBuffer.capacity() < expectedCharactersCount) {
+      outputBuffer = CharBuffer.allocate(expectedCharactersCount);
+    } else {
+      outputBuffer.clear();
+    }
+  }
+
+  /**
+   * Reads a chunk of data from the input stream and puts it onto the input buffer.
+   *
+   * <p>It updates the EOF flag if necessary.
+   *
+   * <p>Assumes that the input buffer is in read mode (or not yet allocated) when called - the
+   * invariant it satisfies between calls; after this function returns, the input buffer is
+   * again in read mode.
+   */
+  private void readInputStreamToInputBuffer(int expectedInputSize) throws IOException {
+    int bufferedInput = inputBuffer == null ? 0 : inputBuffer.remaining();
+    // We always read at least one more byte to ensure that decoding progresses.
+    int bytesToRead = Math.max(expectedInputSize - bufferedInput, 1);
+
+    ensureWorkArraySize(bytesToRead);
+    int bytesActuallyRead = bufferedInputStream.read(workArray, 0, bytesToRead);
+    if (bytesActuallyRead == -1) {
+      eof = true;
+    }
+
+    ensureInputBufferHasEnoughFreeSpace(Math.max(0, bytesActuallyRead));
+
+    if (bytesActuallyRead > 0) {
+      inputBuffer.put(workArray, 0, bytesActuallyRead);
+    }
+
+    // We flip the input buffer back to reading mode, to be able to pass it to the decoder.
+    inputBuffer.flip();
+  }
+
+  /** Allocates or grows the work array so that it can fit the number of bytes we want to read. */
+  private void ensureWorkArraySize(int bytesToRead) {
+    if (workArray == null || workArray.length < bytesToRead) {
+      workArray = new byte[bytesToRead];
+    }
+  }
+
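+  /*
+   A worked sizing example (illustrative): for UTF-8, averageCharsPerByte() is 1.0, so a request
+   for 200 characters gives expectedInputSize = max(200 / 1.0, 10) = 200 bytes. If 60 undecoded
+   bytes are already buffered, only max(200 - 60, 1) = 140 fresh bytes are read; the minimum of
+   1 guarantees forward progress even when enough input seems to be buffered already.
+  */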
+  /**
+   * Runs the decoder on the input buffer, transferring any decoded characters to the output
+   * buffer and growing it as needed.
+   *
+   * <p>Even if the input buffer does not contain any remaining data, but end-of-input has been
+   * encountered, one decoding step is performed to satisfy the contract of the decoder (it
+   * requires one final call to the decode method signifying the end of input).
+   *
+   * <p>After this call, the output buffer is in reading mode.
+   */
+  private void runDecoderOnInputBuffer() {
+    while (inputBuffer.hasRemaining() || (eof && !hadEofDecodeCall)) {
+      CoderResult cr = decoder.decode(inputBuffer, outputBuffer, eof);
+      if (eof) {
+        hadEofDecodeCall = true;
+      }
+
+      if (cr.isMalformed() || cr.isUnmappable()) {
+        reportEncodingProblem();
+
+        if (outputBuffer.remaining() < Encoding_Utils.INVALID_CHARACTER.length()) {
+          growOutputBuffer();
+        }
+        outputBuffer.put(INVALID_CHARACTER);
+        inputBuffer.position(inputBuffer.position() + cr.length());
+      } else if (cr.isUnderflow()) {
+        break;
+      } else if (cr.isOverflow()) {
+        growOutputBuffer();
+      }
+    }
+
+    if (eof) {
+      flushDecoder();
+    }
+
+    // After running the decoding process, we flip the output buffer into reading mode.
+    outputBuffer.flip();
+  }
+
+  /** Returns the number of bytes that have already been consumed by the decoder. */
+  private int getCurrentInputPosition() {
+    if (inputBuffer == null) return 0;
+    return inputBytesConsumedBeforeCurrentBuffer + inputBuffer.position();
+  }
+
+  private void reportEncodingProblem() {
+    encodingIssuePositions.add(getCurrentInputPosition());
+  }
+
+  /**
+   * Flushes the decoder, growing the output buffer as needed to ensure that any additional
+   * output from the decoder fits.
+   */
+  private void flushDecoder() {
+    while (decoder.flush(outputBuffer) == CoderResult.OVERFLOW) {
+      growOutputBuffer();
+    }
+  }
+
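+  /*
+   An illustrative trace of the error path above: decoding the UTF-8 bytes 61 2C C3 28 ("a", ","
+   and then C3 28, an invalid two-byte sequence) produces a malformed-input result of length 1
+   at input position 2. That position is recorded, U+FFFD is written to the output buffer,
+   decoding resumes at byte 3, and getReportedProblems() later returns
+   "Encoding issues at byte 2.".
+  */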
+  /**
+   * Ensures that the input buffer has enough free space to hold the number of bytes that we
+   * want to read.
+   *
+   * <p>If necessary, the buffer is allocated or grown, preserving any existing content.
+   *
+   * <p>Assumes that the input buffer is in read mode when the method is called.
+   *
+   * <p>The buffer is in write mode after this call and has enough space to hold {@code
+   * bytesToRead} bytes.
+   */
+  private void ensureInputBufferHasEnoughFreeSpace(int bytesToRead) {
+    if (inputBuffer == null) {
+      inputBuffer = ByteBuffer.allocate(bytesToRead);
+    } else {
+      int freeSpaceInInputBuffer = inputBuffer.capacity() - inputBuffer.remaining();
+
+      // After either compacting the buffer or reallocating it, any remaining input is shifted
+      // to the beginning of the buffer. The bytes that preceded the current position are thus
+      // dropped (they have already been processed), so we increase the counter to keep track of
+      // the global position in the input.
+      inputBytesConsumedBeforeCurrentBuffer += inputBuffer.position();
+
+      if (freeSpaceInInputBuffer < bytesToRead) {
+        var old = inputBuffer;
+        inputBuffer = ByteBuffer.allocate(old.remaining() + bytesToRead);
+        inputBuffer.put(old);
+      } else {
+        inputBuffer.compact();
+      }
+    }
+
+    assert inputBuffer.remaining() >= bytesToRead;
+  }
+
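+  /*
+   Growth arithmetic (see Encoding_Utils.resize): capacities grow as new_n = (3 * n) / 2 + 1,
+   roughly the 1.5x factor used by ArrayList. Starting from a capacity of 10 this gives the
+   sequence 10 -> 16 -> 25 -> 38 -> 58 -> ..., so repeated overflows cause only O(log n)
+   reallocations.
+  */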
+  /**
+   * Increases the capacity of the output buffer, preserving its contents.
+   *
+   * <p>The buffer is assumed to be in write mode when entering this method and is left in write
+   * mode when the method returns.
+   */
+  private void growOutputBuffer() {
+    outputBuffer = Encoding_Utils.resize(outputBuffer, CharBuffer::allocate, CharBuffer::put);
+  }
+
+  @Override
+  public void close() throws IOException {
+    bufferedInputStream.close();
+  }
+
+  /** Returns a list of problems encountered during the decoding. */
+  public List<String> getReportedProblems() {
+    if (encodingIssuePositions.isEmpty()) {
+      return List.of();
+    } else {
+      if (encodingIssuePositions.size() == 1) {
+        return List.of("Encoding issues at byte " + encodingIssuePositions.get(0) + ".");
+      }
+
+      String issues =
+          encodingIssuePositions.stream()
+              .map(String::valueOf)
+              .collect(Collectors.joining(", ", "Encoding issues at bytes ", "."));
+      return List.of(issues);
+    }
+  }
+}
diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java
index 6a8f10bff344..b983c73f0ede 100644
--- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java
+++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java
@@ -3,7 +3,7 @@
 import com.univocity.parsers.csv.CsvFormat;
 import com.univocity.parsers.csv.CsvParser;
 import com.univocity.parsers.csv.CsvParserSettings;
-import java.io.InputStream;
+import java.io.Reader;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -52,7 +52,7 @@ public enum HeaderBehavior {
   /**
    * Creates a new reader.
    *
-   * @param inputStream the stream to read from
+   * @param input a reader providing decoded input characters
    * @param delimiter the delimiter, should be a single character, but is a String for proper
    *     interoperability with Enso; if a string that does not fit in a single character is
    *     provided, an exception is raised
@@ -72,7 +72,7 @@
    *     to be discarded anyway)
    */
   public DelimitedReader(
-      InputStream inputStream,
+      Reader input,
       String delimiter,
       String quote,
       String quoteEscape,
@@ -132,11 +132,11 @@ public DelimitedReader(
     this.keepInvalidRows = keepInvalidRows;
     this.warningsAsErrors = warningsAsErrors;
 
-    parser = setupCsvParser(inputStream);
+    parser = setupCsvParser(input);
   }
 
   /** Creates a {@code CsvParser} according to the settings specified at construction.
*/ - private CsvParser setupCsvParser(InputStream inputStream) { + private CsvParser setupCsvParser(Reader input) { CsvParserSettings settings = new CsvParserSettings(); settings.setHeaderExtractionEnabled(false); CsvFormat format = new CsvFormat(); @@ -150,7 +150,7 @@ private CsvParser setupCsvParser(InputStream inputStream) { settings.setKeepQuotes(true); settings.setLineSeparatorDetectionEnabled(true); CsvParser parser = new CsvParser(settings); - parser.beginParsing(inputStream); + parser.beginParsing(input); return parser; } diff --git a/test/Table_Tests/data/transient/.gitignore b/test/Table_Tests/data/transient/.gitignore new file mode 100644 index 000000000000..afed0735dc96 --- /dev/null +++ b/test/Table_Tests/data/transient/.gitignore @@ -0,0 +1 @@ +*.csv diff --git a/test/Table_Tests/data/utf16.csv b/test/Table_Tests/data/utf16.csv new file mode 100644 index 000000000000..d15c835fe1b8 Binary files /dev/null and b/test/Table_Tests/data/utf16.csv differ diff --git a/test/Table_Tests/data/utf16_invalid.csv b/test/Table_Tests/data/utf16_invalid.csv new file mode 100644 index 000000000000..a90dac50a3ea Binary files /dev/null and b/test/Table_Tests/data/utf16_invalid.csv differ diff --git a/test/Table_Tests/data/windows.csv b/test/Table_Tests/data/windows.csv new file mode 100644 index 000000000000..2040b61dc607 --- /dev/null +++ b/test/Table_Tests/data/windows.csv @@ -0,0 +1,2 @@ +a,b,c +$�,�,� diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso index 1779323044ac..b0de98a4ef2a 100644 --- a/test/Table_Tests/src/Delimited_Read_Spec.enso +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -1,5 +1,6 @@ from Standard.Base import all import Standard.Base.Error.Problem_Behavior +from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error import Standard.Table import Standard.Table.Data.Column @@ -54,6 +55,72 @@ spec = r2 = File.read directory (File_Format.Delimited "," headers=True) Problem_Behavior.Report_Error r2.should_fail_with File.Io_Error + Test.specify "should work with all kinds of line endings" <| + path name = Enso_Project.data / 'transient' / name + create_file name ending_style = + lines = ['a,b,c', 'd,e,f', '1,2,3'] + text = lines.join ending_style + (path name).write_text text Encoding.utf_8 + + test_file name = + table = File.read (path name) (File_Format.Delimited "," headers=True) Problem_Behavior.Report_Error + table.columns.map .name . should_equal ['a', 'b', 'c'] + table.at 'a' . to_vector . should_equal ['d', '1'] + table.at 'b' . to_vector . should_equal ['e', '2'] + table.at 'c' . to_vector . should_equal ['f', '3'] + + create_file 'crlf.csv' '\r\n' + test_file 'crlf.csv' + create_file 'lf.csv' '\n' + test_file 'lf.csv' + create_file 'cr.csv' '\r' + test_file 'cr.csv' + + # Currently mixed line endings are not supported. + (path 'mixed.csv').write_text 'a,b,c\nd,e,f\r1,2,3' + File.read (path 'mixed.csv') (File_Format.Delimited "," headers=True) Problem_Behavior.Report_Error . should_fail_with Invalid_Row + + Test.specify "should work with Windows-1252 encoding" <| + table = File.read (Enso_Project.data / "windows.csv") (File_Format.Delimited "," headers=True encoding=Encoding.windows_1252) Problem_Behavior.Report_Error + table.columns.map .name . should_equal ['a', 'b', 'c'] + table.at 'a' . to_vector . should_equal ['$¢'] + table.at 'b' . to_vector . should_equal ['¤'] + table.at 'c' . to_vector . 
should_equal ['¥'] + + Test.specify "should work with UTF-16 encoding" <| + table = File.read (Enso_Project.data / "utf16.csv") (File_Format.Delimited "," headers=True encoding=Encoding.utf_16_be) Problem_Behavior.Report_Error + table.columns.map .name . should_equal ['ą', '🚀b', 'ć😎'] + table.at 'ą' . to_vector . should_equal ['ą'] + table.at '🚀b' . to_vector . should_equal ['✨🚀🚧😍😃😍😎😙😉☺'] + table.at 'ć😎' . to_vector . should_equal ['แมวมีสี่ขา'] + + Test.specify "should report errors when encountering malformed characters" <| + utf8_file = (Enso_Project.data / "transient" / "utf8_invalid.csv") + utf8_bytes = [97, 44, 98, 44, 99, 10, -60, -123, 44, -17, -65, -65, 44, -61, 40, -61, 40, 10] + utf8_file.write_bytes utf8_bytes + action_1 on_problems = + utf8_file.read (File_Format.Delimited "," headers=True) on_problems + tester_1 table = + table.columns.map .name . should_equal ['a', 'b', 'c'] + table.at 'a' . to_vector . should_equal ['ą'] + table.at 'b' . to_vector . should_equal ['\uFFFF'] + table.at 'c' . to_vector . should_equal ['\uFFFD(\uFFFD('] + problems_1 = [Encoding_Error "Encoding issues at bytes 13, 15."] + Problems.test_problem_handling action_1 problems_1 tester_1 + + action_2 on_problems = + (Enso_Project.data / "utf16_invalid.csv").read (File_Format.Delimited "," headers=True encoding=Encoding.utf_16_be) on_problems + tester_2 table = + table.columns.map .name . should_equal ['a', 'b', 'c'] + # This column does not raise a problem - the '\uFFFD' is simply present in the input file. + table.at 'a' . to_vector . should_equal ['\uFFFD'] + table.at 'b' . to_vector . should_equal ['\uFFFF'] + # However, this column will raise a problem as the '\uFFFD' comes from replacing an invalid codepoint. + table.at 'c' . to_vector . should_equal ['\uFFFD'] + problems_2 = [Encoding_Error "Encoding issues at byte 22."] + Problems.test_problem_handling action_2 problems_2 tester_2 + + Test.specify "should handle duplicated columns" <| table = File.read (Enso_Project.data / "duplicated_columns.csv") (File_Format.Delimited "," headers=True) table.columns.map .name . should_equal ['a', 'b', 'c', 'a_1'] @@ -81,7 +148,7 @@ spec = Test.specify "should behave correctly in presence of a mismatched quote" <| action_1 on_problems = - (File_Format.Delimited "," headers=True).read (Enso_Project.data / "mismatched_quote.csv") on_problems + File.read (Enso_Project.data / "mismatched_quote.csv") (File_Format.Delimited "," headers=True) on_problems tester_1 table = table.columns.map .name . should_equal ['a', 'b', 'c'] @@ -92,7 +159,7 @@ spec = Problems.test_problem_handling action_1 problems_1 tester_1 action_2 on_problems = - (File_Format.Delimited "," headers=True).read (Enso_Project.data / "mismatched_quote2.csv") on_problems + File.read (Enso_Project.data / "mismatched_quote2.csv") (File_Format.Delimited "," headers=True) on_problems tester_2 table = table.columns.map .name . should_equal ['a', 'b', 'c'] @@ -104,7 +171,7 @@ spec = Test.specify "should handle too long and too short rows" <| action keep_invalid_rows on_problems = - (File_Format.Delimited "," headers=True keep_invalid_rows=keep_invalid_rows).read (Enso_Project.data / "varying_rows.csv") on_problems + File.read (Enso_Project.data / "varying_rows.csv") (File_Format.Delimited "," headers=True keep_invalid_rows=keep_invalid_rows) on_problems tester_kept table = table.columns.map .name . 
should_equal ['a', 'b', 'c'] @@ -124,7 +191,7 @@ spec = Test.specify "should aggregate invalid rows over some limit" <| action on_problems = - (File_Format.Delimited "," headers=True keep_invalid_rows=False).read (Enso_Project.data / "many_invalid_rows.csv") on_problems + File.read (Enso_Project.data / "many_invalid_rows.csv") (File_Format.Delimited "," headers=True keep_invalid_rows=False) on_problems tester table = table.columns.map .name . should_equal ['a', 'b', 'c'] @@ -167,9 +234,9 @@ spec = Test.specify "should check arguments" <| path = (Enso_Project.data / "simple_empty.csv") pb = Problem_Behavior.Report_Error - (File_Format.Delimited "," headers=False quote='abc').read path pb . should_fail_with Illegal_Argument_Error - (File_Format.Delimited "," headers=False quote='🚧').read path pb . should_fail_with Illegal_Argument_Error - (File_Format.Delimited "," headers=False quote_escape='//').read path pb . should_fail_with Illegal_Argument_Error - (File_Format.Delimited 'a\u{301}' headers=False).read path pb . should_fail_with Illegal_Argument_Error + path.read (File_Format.Delimited "," headers=False quote='abc') pb . should_fail_with Illegal_Argument_Error + path.read (File_Format.Delimited "," headers=False quote='🚧') pb . should_fail_with Illegal_Argument_Error + path.read (File_Format.Delimited "," headers=False quote_escape='//') pb . should_fail_with Illegal_Argument_Error + path.read (File_Format.Delimited 'a\u{301}' headers=False) pb . should_fail_with Illegal_Argument_Error main = Test.Suite.run_main here.spec diff --git a/test/Tests/data/transient/.gitignore b/test/Tests/data/transient/.gitignore new file mode 100644 index 000000000000..2211df63dd28 --- /dev/null +++ b/test/Tests/data/transient/.gitignore @@ -0,0 +1 @@ +*.txt diff --git a/test/Tests/data/subdirectory/a.txt b/test/Tests/data/tree/sample.csv similarity index 100% rename from test/Tests/data/subdirectory/a.txt rename to test/Tests/data/tree/sample.csv diff --git a/test/Tests/data/subdirectory/nested/b.txt b/test/Tests/data/tree/sample.txt similarity index 100% rename from test/Tests/data/subdirectory/nested/b.txt rename to test/Tests/data/tree/sample.txt diff --git a/test/Tests/data/tree/subdirectory/a.txt b/test/Tests/data/tree/subdirectory/a.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/Tests/data/tree/subdirectory/nested/b.txt b/test/Tests/data/tree/subdirectory/nested/b.txt new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/test/Tests/src/Main.enso b/test/Tests/src/Main.enso index 47f4dfbf26d4..3f929c4bd3b2 100644 --- a/test/Tests/src/Main.enso +++ b/test/Tests/src/Main.enso @@ -51,6 +51,7 @@ import project.Runtime.Stack_Traces_Spec import project.System.File_Spec import project.System.Process_Spec +import project.System.Reporting_Stream_Decoder_Spec import project.Examples_Spec @@ -63,6 +64,7 @@ main = Test.Suite.run_main <| Error_Spec.spec Examples_Spec.spec File_Spec.spec + Reporting_Stream_Decoder_Spec.spec Http_Header_Spec.spec Http_Request_Spec.spec Http_Spec.spec diff --git a/test/Tests/src/System/File_Spec.enso b/test/Tests/src/System/File_Spec.enso index 93cb92273936..545a489a4f5a 100644 --- a/test/Tests/src/System/File_Spec.enso +++ b/test/Tests/src/System/File_Spec.enso @@ -142,22 +142,25 @@ spec = Test.specify "should list files in a directory" <| immediate = Enso_Project.data.list . 
map .to_text - immediate.sort.should_equal (resolve ["books.json", "sample.txt", "subdirectory", "windows.txt"]) + immediate.sort.should_equal (resolve ["books.json", "sample.txt", "transient", "tree", "windows.txt"]) filtered1 = Enso_Project.data.list name_filter="s[a-cw]mple.{t?t,md}" . map .to_text filtered1.should_equal (resolve ["sample.txt"]) - filtered2 = File.list Enso_Project.data name_filter="*dir*" . map .to_text - filtered2.should_equal (resolve ["subdirectory"]) + filtered2 = File.list Enso_Project.data name_filter="*re*" . map .to_text + filtered2.should_equal (resolve ["tree"]) Test.specify "should list files in a directory recursively" <| - all = Enso_Project.data.list recursive=True . map .to_text - all.sort.should_equal (resolve ["", "books.json", "sample.txt", "subdirectory", "subdirectory/a.txt", "subdirectory/nested", "subdirectory/nested/b.txt", "windows.txt"]) + root = Enso_Project.data / "tree" + resolve files = files.map str-> (root / str) . to_text - filtered1 = Enso_Project.data.list name_filter="**.txt" recursive=True . map .to_text - filtered1.sort.should_equal (resolve ["sample.txt", "subdirectory/a.txt", "subdirectory/nested/b.txt", "windows.txt"]) + all = root.list recursive=True . map .to_text + all.sort.should_equal (resolve ["", "sample.csv", "sample.txt", "subdirectory", "subdirectory/a.txt", "subdirectory/nested", "subdirectory/nested/b.txt"]) - filtered2 = Enso_Project.data.list name_filter="*/*/*" recursive=True . map .to_text + filtered1 = root.list name_filter="**.txt" recursive=True . map .to_text + filtered1.sort.should_equal (resolve ["sample.txt", "subdirectory/a.txt", "subdirectory/nested/b.txt"]) + + filtered2 = root.list name_filter="*/*/*" recursive=True . map .to_text filtered2.should_equal (resolve ["subdirectory/nested/b.txt"]) main = Test.Suite.run_main here.spec diff --git a/test/Tests/src/System/Reporting_Stream_Decoder_Spec.enso b/test/Tests/src/System/Reporting_Stream_Decoder_Spec.enso new file mode 100644 index 000000000000..428d85ada698 --- /dev/null +++ b/test/Tests/src/System/Reporting_Stream_Decoder_Spec.enso @@ -0,0 +1,127 @@ +from Standard.Base import all + +from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error + +polyglot java import org.enso.base.Encoding_Utils +polyglot java import java.nio.CharBuffer + +import Standard.Test +import Standard.Test.Problems + +spec = + windows_file = Enso_Project.data / "windows.txt" + + read_file_one_by_one file java_charset expected_size expected_problems=[] = + file.with_input_stream [File.Option.Read] stream-> + stream.with_java_stream java_stream-> + Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder-> + codepoints = 0.up_to expected_size . 
map _-> + reporting_stream_decoder.read + reporting_stream_decoder.read.should_equal -1 + + problems = Vector.Vector reporting_stream_decoder.getReportedProblems + problems.should_equal expected_problems + + Text.from_codepoints codepoints + + Test.group "ReportingStreamDecoder" <| + Test.specify "should allow reading a file character by character" <| + f = Enso_Project.data / "short.txt" + f.delete_if_exists + f.exists.should_be_false + f.write_text "Cup" + java_charset = Encoding.utf_8.to_java_charset + f.with_input_stream [File.Option.Read] stream-> + stream.with_java_stream java_stream-> + Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder-> + reporting_stream_decoder.read.should_equal 67 + reporting_stream_decoder.read.should_equal 117 + reporting_stream_decoder.read.should_equal 112 + reporting_stream_decoder.read.should_equal -1 + f.delete + f.exists.should_be_false + + Test.specify "should work correctly when reading chunks of varying sizes" <| + f = Enso_Project.data / "transient" / "varying_chunks.txt" + fragment = 'Hello 😎🚀🚧!' + contents = 1.up_to 1000 . map _->fragment . join '\n' + f.write_text contents + java_charset = Encoding.utf_8.to_java_charset + + all_codepoints = Vector.new_builder + read_chars decoder n = + buffer = CharBuffer.allocate n + chars_read = decoder.read buffer + if chars_read == -1 then Nothing else + buffer.flip + v = Vector.new_builder + transfer_codepoints _ = + if buffer.hasRemaining.not then Nothing else + char = buffer.get + v.append char + all_codepoints.append char + @Tail_Call transfer_codepoints Nothing + transfer_codepoints Nothing + v.to_vector + + f.with_input_stream [File.Option.Read] stream-> + stream.with_java_stream java_stream-> + Encoding_Utils.with_stream_decoder java_stream java_charset decoder-> + read_chars decoder 1 . should_equal "H".codepoints + read_chars decoder 2 . should_equal "el".codepoints + read_chars decoder 3 . should_equal "lo ".codepoints + v1 = read_chars decoder 6 + Text.from_codepoints v1 . should_equal '😎🚀🚧' + + v2 = read_chars decoder 200 + ## Here we show that while the decoder is trying to read + 200 codepoints, some codepoints require more than one + byte in UTF-8 to represent, so the actual result + should be slightly smaller. + (v2.length < 200) . should_be_true + + ## Now we read increasingly larger amounts, to trigger + and test all paths of the input buffer resizing + mechanism. + read_chars decoder 40 + read_chars decoder 500 + read_chars decoder 1000 + read_chars decoder 1 + read_chars decoder 2 + read_chars decoder 10 + + ## Finally read all the remaining contents of the file + to verify they were decoded correctly as a whole. + read_rest _ = + case read_chars decoder 100 of + Nothing -> Nothing + _ -> @Tail_Call read_rest Nothing + read_rest Nothing + Text.from_codepoints all_codepoints.to_vector . should_equal contents + f.delete + + Test.specify "should allow reading a UTF-8 file" <| + f = Enso_Project.data / "transient" / "utf8.txt" + encoding = Encoding.utf_8 + java_charset = encoding.to_java_charset + f.write_text ((0.up_to 100).map _->'Hello World!' . join '\n') Encoding.utf_8 + expected_contents = f.read_text + contents = read_file_one_by_one f java_charset expected_contents.length + contents.should_equal expected_contents + + Test.specify "should allow reading a Windows file" <| + encoding = Encoding.windows_1252 + java_charset = encoding.to_java_charset + expected_contents = "Hello World! 
$¢¤¥" + contents = read_file_one_by_one windows_file java_charset expected_contents.length + contents.should_equal expected_contents + + Test.specify "should raise warnings when reading invalid characters" <| + encoding = Encoding.ascii + java_charset = encoding.to_java_charset + expected_contents = 'Hello World! $\uFFFD\uFFFD\uFFFD' + expected_problems = ["Encoding issues at bytes 14, 15, 16."] + contents = read_file_one_by_one windows_file java_charset expected_contents.length expected_problems=expected_problems + contents.should_equal expected_contents + +main = Test.Suite.run_main here.spec