From 47e6aba586b755e82879d6b13201fbbf91650a2c Mon Sep 17 00:00:00 2001 From: sebthom Date: Wed, 21 Aug 2024 11:47:58 +0200 Subject: [PATCH] refact: cleanup Char(Array|Sequence)InputStream --- .../io/stream/AbstractCharsInputStream.java | 70 +++++++++++++++++-- .../core/io/stream/CharArrayInputStream.java | 21 ++---- .../io/stream/CharSequenceInputStream.java | 25 +++---- .../io/stream/CharArrayInputStreamTest.java | 14 ++-- .../stream/CharSequenceInputStreamTest.java | 14 ++-- 5 files changed, 91 insertions(+), 53 deletions(-) diff --git a/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/AbstractCharsInputStream.java b/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/AbstractCharsInputStream.java index 7c731039b..9c5c93f7d 100644 --- a/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/AbstractCharsInputStream.java +++ b/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/AbstractCharsInputStream.java @@ -10,6 +10,7 @@ import java.io.InputStream; import java.nio.ByteBuffer; import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.nio.charset.CoderResult; @@ -26,13 +27,29 @@ abstract class AbstractCharsInputStream extends InputStream { protected enum EncoderState { - ENCODING, - FLUSHING, + /** + * The {@link #encoder} is actively encoding characters into bytes. This is the + * initial state of the encoder. + */ + ENCODING, // + + /** + * The {@link #encoder} has finished processing all characters and is now + * flushing any remaining bytes in its internal buffer. + */ + FLUSHING, // + + /** + * The {@link #encoder} has completed both the encoding and flushing processes. + * No more data is left to be read from the encoder. 
+ */ DONE } + protected static final char UNICODE_REPLACEMENT_CHAR = '\uFFFD'; + /** 1024 surrogate character pairs */ - protected static final int DEFAULT_BUFFER_SIZE = 1024; + protected static final int DEFAULT_BUFFER_SIZE = 512; protected static final int CHAR_BUFFER_MULTIPLIER = 2; // 2 chars for one high/low surrogate character pair protected static final int BYTE_BUFFER_MULTIPLIER = 4; // 4 bytes for one UTF character (up to 4 bytes) @@ -75,6 +92,32 @@ protected AbstractCharsInputStream(final int bufferSize) { @Override public abstract int available(); + /** + * This method is called by {@link #refillByteBuffer()} to encode characters + * from the given {@link CharBuffer} into bytes and stores them in the + * {@link #byteBuffer}. + * + *

<p>
+ * The method can be used either to encode characters in the middle of input + * (with {@code isEndOfInput=false}) or to finalize the encoding process at the + * end of input (with {@code isEndOfInput=true}). + * 
</p>
+ * + * @param in + * the {@link CharBuffer} containing characters to encode. + * @param isEndOfInput + * if {@code true}, signals that no more input will be provided, + * allowing the encoder to complete its final encoding steps. + */ + protected void encodeChars(final CharBuffer in, final boolean isEndOfInput) throws CharacterCodingException { + byteBuffer.clear(); + final CoderResult result = encoder.encode(in, byteBuffer, isEndOfInput); + byteBuffer.flip(); + if (result.isError()) { + result.throwException(); + } + } + protected boolean flushEncoder() throws IOException { if (encoderState == EncoderState.DONE) return false; @@ -88,8 +131,12 @@ protected boolean flushEncoder() throws IOException { final CoderResult result = encoder.flush(byteBuffer); byteBuffer.flip(); - if (result.isOverflow()) // byteBuffer too small + if (result.isOverflow()) { + // the byteBuffer has been filled, but there are more bytes to be flushed. + // after reading all available bytes from byteBuffer, flushEncoder() needs to + // be called again to process the remaining data. return true; + } if (result.isError()) { result.throwException(); @@ -116,7 +163,7 @@ public boolean markSupported() { @Override public int read() throws IOException { - if (!byteBuffer.hasRemaining() && !refillBuffer()) + if (!byteBuffer.hasRemaining() && !refillByteBuffer()) return IOUtils.EOF; return byteBuffer.get() & 0xFF; // next byte as an unsigned integer (0 to 255) } @@ -132,7 +179,7 @@ public int read(final byte[] buf, final int off, final int bytesToRead) throws I while (bytesRead < bytesToRead) { if (bytesReadable == 0) { - if (refillBuffer()) { + if (refillByteBuffer()) { bytesReadable = byteBuffer.remaining(); } else return bytesRead == 0 ? 
IOUtils.EOF : bytesRead; @@ -147,7 +194,16 @@ public int read(final byte[] buf, final int off, final int bytesToRead) throws I return bytesRead; } - protected abstract boolean refillBuffer() throws IOException; + /** + * Refills the {@link #byteBuffer} by reading characters from the character + * supplier, encoding them, and storing the resulting bytes into the + * {@link #byteBuffer}. + * + * @return {@code true} if the buffer was successfully refilled and has bytes + * available for reading, {@code false} if the end of the stream is + * reached and there are no more bytes to read. + */ + protected abstract boolean refillByteBuffer() throws IOException; @Override public synchronized void reset() throws IOException { diff --git a/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/CharArrayInputStream.java b/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/CharArrayInputStream.java index df8ab75a2..752fb3589 100644 --- a/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/CharArrayInputStream.java +++ b/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/CharArrayInputStream.java @@ -7,7 +7,6 @@ import java.io.IOException; import java.nio.CharBuffer; import java.nio.charset.Charset; -import java.nio.charset.CoderResult; /** * @author Sebastian Thomschke @@ -46,7 +45,7 @@ public int available() { } @Override - protected boolean refillBuffer() throws IOException { + protected boolean refillByteBuffer() throws IOException { if (encoderState == EncoderState.DONE) return false; @@ -58,12 +57,7 @@ protected boolean refillBuffer() throws IOException { // if EOF is reached transition to flushing if (charIndex >= charsLen) { // finalize encoding before switching to flushing - byteBuffer.clear(); - final CoderResult result = encoder.encode(CharBuffer.allocate(0), byteBuffer, true /* signal EOF */); - byteBuffer.flip(); - if (result.isError()) { - result.throwException(); - } + encodeChars(CharBuffer.allocate(0), true /* signal EOF */); return flushEncoder(); } 
@@ -80,11 +74,11 @@ protected boolean refillBuffer() throws IOException { charBuffer.put(lowSurrogate); } else { // missing low surrogate - fallback to replacement character - charBuffer.put('\uFFFD'); + charBuffer.put(UNICODE_REPLACEMENT_CHAR); } } else { // missing low surrogate - fallback to replacement character - charBuffer.put('\uFFFD'); + charBuffer.put(UNICODE_REPLACEMENT_CHAR); break; } } else { @@ -94,12 +88,7 @@ protected boolean refillBuffer() throws IOException { charBuffer.flip(); // encode chars into bytes - byteBuffer.clear(); - final CoderResult result = encoder.encode(charBuffer, byteBuffer, false); - byteBuffer.flip(); - if (result.isError()) { - result.throwException(); - } + encodeChars(charBuffer, false); } catch (final RuntimeException ex) { throw new IOException(ex); } diff --git a/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/CharSequenceInputStream.java b/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/CharSequenceInputStream.java index ed1bb47b1..1b22f4bac 100644 --- a/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/CharSequenceInputStream.java +++ b/jstuff-core/src/main/java/net/sf/jstuff/core/io/stream/CharSequenceInputStream.java @@ -7,7 +7,6 @@ import java.io.IOException; import java.nio.CharBuffer; import java.nio.charset.Charset; -import java.nio.charset.CoderResult; import java.util.List; import java.util.function.IntSupplier; @@ -18,6 +17,10 @@ */ public class CharSequenceInputStream extends AbstractCharsInputStream { + /** + * Functional interface for supplying characters at a specified index. + * Implementations can define how characters are fetched. 
+ */ @FunctionalInterface public interface CharsSupplier { char charAt(int index) throws Exception; @@ -133,7 +136,7 @@ public int available() { } @Override - protected boolean refillBuffer() throws IOException { + protected boolean refillByteBuffer() throws IOException { if (encoderState == EncoderState.DONE) return false; @@ -145,12 +148,7 @@ protected boolean refillBuffer() throws IOException { // if EOF is reached transition to flushing if (charIndex >= charsLen) { // finalize encoding before switching to flushing - byteBuffer.clear(); - final CoderResult result = encoder.encode(CharBuffer.allocate(0), byteBuffer, true /* signal EOF */); - byteBuffer.flip(); - if (result.isError()) { - result.throwException(); - } + encodeChars(CharBuffer.allocate(0), true /* signal EOF */); return flushEncoder(); } @@ -167,11 +165,11 @@ protected boolean refillBuffer() throws IOException { charBuffer.put(lowSurrogate); } else { // missing low surrogate - fallback to replacement character - charBuffer.put('\uFFFD'); + charBuffer.put(UNICODE_REPLACEMENT_CHAR); } } else { // missing low surrogate - fallback to replacement character - charBuffer.put('\uFFFD'); + charBuffer.put(UNICODE_REPLACEMENT_CHAR); break; } } else { @@ -181,12 +179,7 @@ protected boolean refillBuffer() throws IOException { charBuffer.flip(); // encode chars into bytes - byteBuffer.clear(); - final CoderResult result = encoder.encode(charBuffer, byteBuffer, false); - byteBuffer.flip(); - if (result.isError()) { - result.throwException(); - } + encodeChars(charBuffer, false); } catch (final Exception ex) { throw new IOException(ex); } diff --git a/jstuff-core/src/test/java/net/sf/jstuff/core/io/stream/CharArrayInputStreamTest.java b/jstuff-core/src/test/java/net/sf/jstuff/core/io/stream/CharArrayInputStreamTest.java index 701f347d9..0d7329cb8 100644 --- a/jstuff-core/src/test/java/net/sf/jstuff/core/io/stream/CharArrayInputStreamTest.java +++ 
b/jstuff-core/src/test/java/net/sf/jstuff/core/io/stream/CharArrayInputStreamTest.java @@ -29,7 +29,7 @@ public class CharArrayInputStreamTest { public void testAvailable() throws IOException { try (var is = new CharArrayInputStream(TEST_ASCII.toCharArray())) { assertThat(is.available()).isEqualTo(TEST_ASCII.length()); - final byte[] buffer = new byte[4]; + final var buffer = new byte[4]; is.read(buffer); assertThat(is.available()).isEqualTo(TEST_ASCII.length() - 4); is.readAllBytes(); @@ -87,7 +87,7 @@ public void testReadEachByte() throws IOException { bytesRead.add((byte) b); } - final byte[] byteArray = new byte[bytesRead.size()]; + final var byteArray = new byte[bytesRead.size()]; for (int i = 0; i < bytesRead.size(); i++) { byteArray[i] = bytesRead.get(i); } @@ -97,7 +97,7 @@ public void testReadEachByte() throws IOException { @Test public void testReadIntoByteArray() throws IOException { - final byte[] buffer = new byte[1024]; // Buffer to read a portion of the text + final var buffer = new byte[1024]; // Buffer to read a portion of the text try (var is = new CharArrayInputStream(TEST_UNICODE.toCharArray())) { final int bytesRead = is.read(buffer, 0, buffer.length); @@ -110,7 +110,7 @@ public void testReadIntoByteArray() throws IOException { @Test public void testResetWithoutMark() throws IOException { try (var is = new CharArrayInputStream(TEST_UNICODE.toCharArray())) { - final byte[] buffer = new byte[EMOJI_BYTES_LEN]; + final var buffer = new byte[EMOJI_BYTES_LEN]; // read the first few bytes (the emoji) assertThat(is.read(buffer)).isEqualTo(EMOJI_BYTES_LEN); @@ -130,7 +130,7 @@ public void testSkip() throws IOException { final long skipped = is.skip(EMOJI_BYTES_LEN); assertThat(skipped).isEqualTo(EMOJI_BYTES_LEN); - final byte[] japanese = new byte[TEST_UNICODE_BYTES_LEN]; + final var japanese = new byte[TEST_UNICODE_BYTES_LEN]; final int bytesRead = is.read(japanese); assertThat(new String(japanese, 0, bytesRead, UTF_8)).isEqualTo(JAPANESE); @@ -142,7 
+142,7 @@ public void testHighSurrogateAtEndOfInput() throws IOException { final char[] invalidSequence = {'A', '\uD800'}; // valid char followed by an isolated high surrogate try (var is = new CharArrayInputStream(invalidSequence, UTF_8)) { final byte[] result = is.readAllBytes(); - final String output = new String(result, UTF_8); + final var output = new String(result, UTF_8); // the high surrogate at the end should be replaced by the Unicode replacement char assertThat(output).isEqualTo("A" + "\uFFFD"); @@ -154,7 +154,7 @@ public void testHighSurrogateWithoutLowSurrogate() throws IOException { final char[] invalidSequence = {'\uD800', 'A'}; // \uD800 is a high surrogate, followed by 'A' try (var is = new CharArrayInputStream(invalidSequence, UTF_8)) { final byte[] result = is.readAllBytes(); - final String output = new String(result, UTF_8); + final var output = new String(result, UTF_8); // the invalid surrogate pair should be replaced by the Unicode replacement char assertThat(output).isEqualTo("\uFFFD" + "A"); diff --git a/jstuff-core/src/test/java/net/sf/jstuff/core/io/stream/CharSequenceInputStreamTest.java b/jstuff-core/src/test/java/net/sf/jstuff/core/io/stream/CharSequenceInputStreamTest.java index f4c89878e..2d632d96d 100644 --- a/jstuff-core/src/test/java/net/sf/jstuff/core/io/stream/CharSequenceInputStreamTest.java +++ b/jstuff-core/src/test/java/net/sf/jstuff/core/io/stream/CharSequenceInputStreamTest.java @@ -29,7 +29,7 @@ public class CharSequenceInputStreamTest { public void testAvailable() throws IOException { try (var is = new CharSequenceInputStream(TEST_ASCII)) { assertThat(is.available()).isEqualTo(TEST_ASCII.length()); - final byte[] buffer = new byte[4]; + final var buffer = new byte[4]; is.read(buffer); assertThat(is.available()).isEqualTo(TEST_ASCII.length() - 4); is.readAllBytes(); @@ -87,7 +87,7 @@ public void testReadEachByte() throws IOException { bytesRead.add((byte) b); } - final byte[] byteArray = new byte[bytesRead.size()]; + 
final var byteArray = new byte[bytesRead.size()]; for (int i = 0; i < bytesRead.size(); i++) { byteArray[i] = bytesRead.get(i); } @@ -97,7 +97,7 @@ public void testReadEachByte() throws IOException { @Test public void testReadIntoByteArray() throws IOException { - final byte[] buffer = new byte[1024]; // Buffer to read a portion of the text + final var buffer = new byte[1024]; // Buffer to read a portion of the text try (var is = new CharSequenceInputStream(TEST_UNICODE)) { final int bytesRead = is.read(buffer, 0, buffer.length); @@ -110,7 +110,7 @@ public void testReadIntoByteArray() throws IOException { @Test public void testResetWithoutMark() throws IOException { try (var is = new CharSequenceInputStream(TEST_UNICODE)) { - final byte[] buffer = new byte[EMOJI_BYTES_LEN]; + final var buffer = new byte[EMOJI_BYTES_LEN]; // read the first few bytes (the emoji) assertThat(is.read(buffer)).isEqualTo(EMOJI_BYTES_LEN); @@ -130,7 +130,7 @@ public void testSkip() throws IOException { final long skipped = is.skip(EMOJI_BYTES_LEN); assertThat(skipped).isEqualTo(EMOJI_BYTES_LEN); - final byte[] japanese = new byte[TEST_UNICODE_BYTES_LEN]; + final var japanese = new byte[TEST_UNICODE_BYTES_LEN]; final int bytesRead = is.read(japanese); assertThat(new String(japanese, 0, bytesRead, UTF_8)).isEqualTo(JAPANESE); @@ -142,7 +142,7 @@ public void testHighSurrogateAtEndOfInput() throws IOException { final char[] invalidSequence = {'A', '\uD800'}; // valid char followed by an isolated high surrogate try (var is = new CharSequenceInputStream(new String(invalidSequence), UTF_8)) { final byte[] result = is.readAllBytes(); - final String output = new String(result, UTF_8); + final var output = new String(result, UTF_8); // the high surrogate at the end should be replaced by the Unicode replacement char assertThat(output).isEqualTo("A" + "\uFFFD"); @@ -154,7 +154,7 @@ public void testHighSurrogateWithoutLowSurrogate() throws IOException { final char[] invalidSequence = {'\uD800', 'A'}; 
// \uD800 is a high surrogate, followed by 'A' try (var is = new CharSequenceInputStream(new String(invalidSequence), UTF_8)) { final byte[] result = is.readAllBytes(); - final String output = new String(result, UTF_8); + final var output = new String(result, UTF_8); // the invalid surrogate pair should be replaced by the Unicode replacement char assertThat(output).isEqualTo("\uFFFD" + "A");