Merge pull request #502 from marklogic/CSV-196-master

CSV-196-TrackBytePositions
apache · Jan 2, 2025 · b40039b · b40039b
2 parents dd7b4b3 + d403084
commit b40039b
Show file tree

Hide file tree

Showing 10 changed files with 321 additions and 7 deletions.
diff --git a/pom.xml b/pom.xml
@@ -245,6 +245,8 @@
               <exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
+              <exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
+              <exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
               <exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>

diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -155,6 +155,7 @@ public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
         private CSVFormat format;
         private long characterOffset;
         private long recordNumber = 1;
+        private boolean enableByteTracking;
 
         /**
          * Constructs a new instance.
@@ -166,7 +167,7 @@ protected Builder() {
         @SuppressWarnings("resource")
         @Override
         public CSVParser get() throws IOException {
-            return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber);
+            return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), enableByteTracking);
         }
 
         /**
@@ -202,6 +203,18 @@ public Builder setRecordNumber(final long recordNumber) {
             return asThis();
         }
 
+        /**
+         * Sets whether to enable byte tracking for the parser.
+         * <p>
+         * The flag is handed to the parser built by this builder together with the builder's charset.
+         * Bytes are only counted when the flag is {@code true} and a charset is available, because
+         * the byte length of each character is computed with an encoder for that charset.
+         * </p>
+         *
+         * @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it.
+         * @return this instance.
+         * @since 1.13.0
+         */
+        public Builder setEnableByteTracking(final boolean enableByteTracking) {
+            this.enableByteTracking = enableByteTracking;
+            return asThis();
+        }
+
     }
 
     final class CSVRecordIterator implements Iterator<CSVRecord> {
@@ -510,11 +523,43 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
     @Deprecated
     @SuppressWarnings("resource")
     public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
+        throws IOException {
+            this(reader, format, characterOffset, recordNumber, null, false);
+        }
+
+    /**
+     * Constructs a new instance using the given {@link CSVFormat}.
+     *
+     * <p>
+     * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
+     * unless you close the {@code reader}.
+     * </p>
+     *
+     * @param reader
+     *            a Reader containing CSV-formatted input. Must not be null.
+     * @param format
+     *            the CSVFormat used for CSV parsing. Must not be null.
+     * @param characterOffset
+     *            Lexer offset when the parser does not start parsing at the beginning of the source.
+     * @param recordNumber
+     *            The next record number to assign.
+     * @param charset
+     *            The character encoding to be used for the reader when enableByteTracking is true.
+     * @param enableByteTracking
+     *           {@code true} to enable byte tracking for the parser; {@code false} to disable it.
+     * @throws IllegalArgumentException
+     *             If the parameters of the format are inconsistent or if either the reader or format is null.
+     * @throws IOException
+     *             If there is a problem reading the header or skipping the first record.
+     * @throws CSVException Thrown on invalid input.
+     */
+    private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
+        final Charset charset, final boolean enableByteTracking)
         throws IOException {
         Objects.requireNonNull(reader, "reader");
         Objects.requireNonNull(format, "format");
         this.format = format.copy();
-        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
+        this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, enableByteTracking));
         this.csvRecordIterator = new CSVRecordIterator();
         this.headers = createHeaders();
         this.characterOffset = characterOffset;
@@ -841,6 +886,7 @@ CSVRecord nextRecord() throws IOException {
         recordList.clear();
         StringBuilder sb = null;
         final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
+        final long startBytePosition = lexer.getBytesRead() + this.characterOffset;
         do {
             reusableToken.reset();
             lexer.nextToken(reusableToken);
@@ -878,7 +924,7 @@ CSVRecord nextRecord() throws IOException {
             recordNumber++;
             final String comment = Objects.toString(sb, null);
             result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
-                recordNumber, startCharPosition);
+                recordNumber, startCharPosition, startBytePosition);
         }
         return result;
     }

diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java
@@ -50,6 +50,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
      */
     private final long characterPosition;
 
+    /**
+     * The starting position of this record in the source stream, measured in bytes.
+     */
+    private final long bytePosition;
+
     /** The accumulated comments (if any) */
     private final String comment;
 
@@ -62,15 +67,15 @@ public final class CSVRecord implements Serializable, Iterable<String> {
     /** The parser that originates this record. This is not serialized. */
     private final transient CSVParser parser;
 
-    CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
-            final long characterPosition) {
+    CSVRecord(final CSVParser parser, final String[] values,  final String comment, final long recordNumber,
+            final long characterPosition, final long bytePosition) {
         this.recordNumber = recordNumber;
         this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY;
         this.parser = parser;
         this.comment = comment;
         this.characterPosition = characterPosition;
+        this.bytePosition = bytePosition;
     }
-
     /**
      * Returns a value by {@link Enum}.
      *
@@ -146,6 +151,16 @@ public long getCharacterPosition() {
         return characterPosition;
     }
 
+    /**
+     * Returns the starting position of this record in the source stream, measured in bytes.
+     * <p>
+     * The value is the one given to this record when it was constructed. The parser computes it
+     * from the number of bytes its lexer has counted before the record plus the parser's configured
+     * offset, so when byte tracking is not enabled no bytes are counted and the value only reflects
+     * that offset.
+     * </p>
+     *
+     * @return the byte position of this record in the source stream.
+     * @since 1.13.0
+     */
+    public long getBytePosition() {
+        return bytePosition;
+    }
+
     /**
      * Returns the comment for this record, if any.
      * Note that comments are attached to the following record.

diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
@@ -26,6 +26,10 @@
 
 import java.io.IOException;
 import java.io.Reader;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.UnsynchronizedBufferedReader;
@@ -51,13 +55,36 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
     private long position;
     private long positionMark;
 
+    /** The number of bytes read so far. */
+    private long bytesRead;
+    private long bytesReadMark;
+
+    /** Encoder for calculating the number of bytes for each character read. */
+    private CharsetEncoder encoder;
+
     /**
      * Constructs a new instance using the default buffer size.
      */
     ExtendedBufferedReader(final Reader reader) {
         super(reader);
     }
 
+    /**
+     * Constructs a new instance using the default buffer size and, optionally, byte tracking.
+     * <p>
+     * An encoder is created only when byte tracking is enabled and a character set is provided.
+     * Without an encoder, {@link #read()} does not add to the byte count.
+     * </p>
+     *
+     * @param reader the reader to wrap.
+     * @param charset the character set used to compute encoded byte lengths, or {@code null} if not applicable.
+     * @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it.
+     */
+    ExtendedBufferedReader(final Reader reader, final Charset charset, final boolean enableByteTracking) {
+        super(reader);
+        if (charset != null && enableByteTracking) {
+            encoder = charset.newEncoder();
+        }
+    }
+
     /**
      * Closes the stream.
      *
@@ -110,6 +137,7 @@ public void mark(final int readAheadLimit) throws IOException {
         lineNumberMark = lineNumber;
         lastCharMark = lastChar;
         positionMark = position;
+        bytesReadMark = bytesRead;
         super.mark(readAheadLimit);
     }
 
@@ -120,11 +148,59 @@ public int read() throws IOException {
             current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
             lineNumber++;
         }
+        if (encoder != null) {
+            this.bytesRead += getEncodedCharLength(current);
+        }
         lastChar = current;
         position++;
         return lastChar;
     }
 
+    /**
+     * Gets the number of bytes the given character occupies when encoded with this reader's encoder.
+     * <p>
+     * Java represents text as 16-bit {@code char} values, following the original Unicode
+     * specification, which defined characters as fixed-width 16-bit entities.
+     * The Unicode characters are divided into two main ranges:
+     * </p>
+     * <ul>
+     *   <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b>
+     *     <ul>
+     *       <li>Represented using a single 16-bit {@code char}.</li>
+     *       <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li>
+     *     </ul>
+     *   </li>
+     *   <li><b>U+10000 to U+10FFFF (Supplementary Characters):</b>
+     *     <ul>
+     *       <li>Represented as a pair of {@code char}s:</li>
+     *       <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
+     *       <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
+     *       <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li>
+     *     </ul>
+     *   </li>
+     * </ul>
+     * <p>
+     * A high surrogate contributes zero bytes on its own; the whole pair is counted when the
+     * following low surrogate is read. A negative value (end of stream) also contributes zero bytes.
+     * </p>
+     *
+     * @param current the value just returned by the underlying read, either a {@code char} value or
+     *                a negative end-of-stream marker.
+     * @return the byte length of the character, or {@code 0} for a high surrogate or end of stream.
+     * @throws CharacterCodingException if the character cannot be encoded, or if a low surrogate does
+     *                                  not follow a high surrogate.
+     */
+    private int getEncodedCharLength(final int current) throws CharacterCodingException {
+        if (current < 0) {
+            // End of stream: no character was consumed. Casting -1 to char would yield U+FFFF, which
+            // is counted as 3 bytes in UTF-8 and is unmappable in charsets such as ISO-8859-1.
+            return 0;
+        }
+        final char cChar = (char) current;
+        if (!Character.isSurrogate(cChar)) {
+            return encoder.encode(CharBuffer.wrap(new char[] {cChar})).limit();
+        }
+        if (Character.isHighSurrogate(cChar)) {
+            // Counted together with the low surrogate that follows.
+            return 0;
+        }
+        final char lChar = (char) lastChar;
+        if (Character.isSurrogatePair(lChar, cChar)) {
+            return encoder.encode(CharBuffer.wrap(new char[] {lChar, cChar})).limit();
+        }
+        // A low surrogate without a preceding high surrogate is malformed input.
+        throw new CharacterCodingException();
+    }
+
     @Override
     public int read(final char[] buf, final int offset, final int length) throws IOException {
         if (length == 0) {
@@ -189,7 +265,17 @@ public void reset() throws IOException {
         lineNumber = lineNumberMark;
         lastChar = lastCharMark;
         position = positionMark;
+        bytesRead = bytesReadMark;
         super.reset();
     }
 
+    /**
+     * Gets the number of bytes read by the reader.
+     * <p>
+     * {@link #read()} adds to this count only when an encoder is present, that is, when byte
+     * tracking was enabled with a character set at construction time.
+     * </p>
+     *
+     * @return the number of bytes read by the reader.
+     */
+    long getBytesRead() {
+        return this.bytesRead;
+    }
+
 }
diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java
@@ -105,6 +105,15 @@ long getCharacterPosition() {
         return reader.getPosition();
     }
 
+    /**
+     * Gets the number of bytes read so far, as reported by the underlying reader.
+     *
+     * @return the number of bytes read.
+     */
+    long getBytesRead() {
+        return reader.getBytesRead();
+    }
+
     /**
      * Returns the current line number
      *

diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@@ -703,6 +703,76 @@ public void testGetHeaderComment_NoComment3() throws IOException {
         }
     }
 
+    /**
+     * Verifies record byte positions when fields contain characters that take three bytes in UTF-8.
+     */
+    @Test
+    public void testGetRecordThreeBytesRead() throws Exception {
+        final String code = "id,date,val5,val4\n" +
+            "11111111111111,'4017-09-01',きちんと節分近くには咲いてる～,v4\n" +
+            "22222222222222,'4017-01-01',おはよう私の友人～,v4\n" +
+            "33333333333333,'4017-01-01',きる自然の力ってすごいな～,v4\n";
+        final CSVFormat format = CSVFormat.Builder.create()
+            .setDelimiter(',')
+            .setQuote('\'')
+            .get();
+        try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get()) {
+            assertEquals(0, parser.getRecordNumber());
+
+            // Everything before the first two records is ASCII, so byte and character positions coincide.
+            CSVRecord record = parser.nextRecord();
+            assertNotNull(record);
+            assertEquals(1, record.getRecordNumber());
+            assertEquals(code.indexOf('i'), record.getCharacterPosition());
+            assertEquals(record.getCharacterPosition(), record.getBytePosition());
+
+            record = parser.nextRecord();
+            assertNotNull(record);
+            assertEquals(2, record.getRecordNumber());
+            assertEquals(code.indexOf('1'), record.getCharacterPosition());
+            assertEquals(record.getCharacterPosition(), record.getBytePosition());
+
+            // 18 bytes (first line) + 77 bytes (second line: 32 ASCII + 15 three-byte characters).
+            record = parser.nextRecord();
+            assertNotNull(record);
+            assertEquals(3, record.getRecordNumber());
+            assertEquals(code.indexOf('2'), record.getCharacterPosition());
+            assertEquals(95, record.getBytePosition());
+
+            // 95 bytes + 59 bytes (third line: 32 ASCII + 9 three-byte characters).
+            record = parser.nextRecord();
+            assertNotNull(record);
+            assertEquals(4, record.getRecordNumber());
+            assertEquals(code.indexOf('3'), record.getCharacterPosition());
+            assertEquals(154, record.getBytePosition());
+        }
+    }
+
+    /**
+     * Verifies record byte positions when fields contain supplementary characters (surrogate pairs)
+     * that take four bytes in UTF-8.
+     */
+    @Test
+    public void testGetRecordFourBytesRead() throws Exception {
+        final String code = "id,a,b,c\n" +
+            "1,😊,🤔,😂\n" +
+            "2,😊,🤔,😂\n" +
+            "3,😊,🤔,😂\n";
+        final CSVFormat format = CSVFormat.Builder.create()
+            .setDelimiter(',')
+            .setQuote('\'')
+            .get();
+        try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get()) {
+            assertEquals(0, parser.getRecordNumber());
+
+            // Everything before the first two records is ASCII, so byte and character positions coincide.
+            CSVRecord record = parser.nextRecord();
+            assertNotNull(record);
+            assertEquals(1, record.getRecordNumber());
+            assertEquals(code.indexOf('i'), record.getCharacterPosition());
+            assertEquals(record.getCharacterPosition(), record.getBytePosition());
+
+            record = parser.nextRecord();
+            assertNotNull(record);
+            assertEquals(2, record.getRecordNumber());
+            assertEquals(code.indexOf('1'), record.getCharacterPosition());
+            assertEquals(record.getCharacterPosition(), record.getBytePosition());
+
+            // 9 bytes (first line) + 17 bytes (second line: 5 ASCII + 3 four-byte characters).
+            record = parser.nextRecord();
+            assertNotNull(record);
+            assertEquals(3, record.getRecordNumber());
+            assertEquals(code.indexOf('2'), record.getCharacterPosition());
+            assertEquals(26, record.getBytePosition());
+
+            // 26 bytes + 17 bytes (third line has the same layout as the second).
+            record = parser.nextRecord();
+            assertNotNull(record);
+            assertEquals(4, record.getRecordNumber());
+            assertEquals(code.indexOf('3'), record.getCharacterPosition());
+            assertEquals(43, record.getBytePosition());
+        }
+    }
+
     @Test
     public void testGetHeaderMap() throws Exception {
         try (CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {

diff --git a/src/test/java/org/apache/commons/csv/CSVRecordTest.java b/src/test/java/org/apache/commons/csv/CSVRecordTest.java
@@ -87,7 +87,7 @@ record = parser.iterator().next();
     @Test
     public void testCSVRecordNULLValues() throws IOException {
         try (CSVParser parser = CSVParser.parse("A,B\r\nONE,TWO", CSVFormat.DEFAULT.withHeader())) {
-            final CSVRecord csvRecord = new CSVRecord(parser, null, null, 0L, 0L);
+            final CSVRecord csvRecord = new CSVRecord(parser, null, null, 0L, 0L, 0L);
             assertEquals(0, csvRecord.size());
             assertThrows(IllegalArgumentException.class, () -> csvRecord.get("B"));
         }