@@ -231,6 +232,8 @@
src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv
src/test/resources/org/apache/commons/csv/csv-167/sample1.csv
src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv
+ src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv
+ src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv
src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv
src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv
src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv
diff --git a/src/main/java/org/apache/commons/csv/CSVFormat.java b/src/main/java/org/apache/commons/csv/CSVFormat.java
index dd5416e11d..7738961837 100644
--- a/src/main/java/org/apache/commons/csv/CSVFormat.java
+++ b/src/main/java/org/apache/commons/csv/CSVFormat.java
@@ -2074,6 +2074,30 @@ public CSVParser parse(final Reader reader) throws IOException {
return new CSVParser(reader, this);
}
+ /**
+ * Parses the specified content, tracking the byte position of each record.
+ *
+ * <p>
+ * The returned {@link CSVParser} iterates over the parsed {@link CSVRecord}s. The reader supplies characters that are
+ * already decoded; {@code encoding} is only used to count how many bytes those characters occupy, so that each record
+ * can report its start byte through {@link CSVRecord#getCharacterByte()}.
+ * </p>
+ * <p>
+ * For additional parsing options, see the various static parse methods available on {@link CSVParser}.
+ * </p>
+ *
+ * @param reader the character source to parse
+ * @param characterOffset the offset added to reported record positions when parsing does not start at the beginning of the source
+ * @param recordNumber the next record number to assign
+ * @param encoding the name of the charset used to count the bytes read, or {@code null} to disable byte counting
+ * @return a parser over a stream of {@link CSVRecord}s.
+ * @throws IOException If an I/O error occurs
+ * @throws CSVException Thrown on invalid input.
+ */
+ public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException {
+ return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
+ }
+
/**
* Prints to the specified output.
*
diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java
index a2bc230706..761599a397 100644
--- a/src/main/java/org/apache/commons/csv/CSVParser.java
+++ b/src/main/java/org/apache/commons/csv/CSVParser.java
@@ -438,10 +438,39 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
@SuppressWarnings("resource")
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
throws IOException {
+ this(reader, format, characterOffset, recordNumber, null);
+ }
+
+ /**
+ * Constructs a new instance using the given {@link CSVFormat}
+ *
+ *
+ * If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
+ * unless you close the {@code reader}.
+ *
+ *
+ * @param reader
+ * a Reader containing CSV-formatted input. Must not be null.
+ * @param format
+ * the CSVFormat used for CSV parsing. Must not be null.
+ * @param characterOffset
+ * Lexer offset when the parser does not start parsing at the beginning of the source.
+ * @param recordNumber
+ * The next record number to assign
+ * @param encoding
+ * Name of the charset used to count the bytes read, or {@code null} to disable byte counting
+ * @throws IllegalArgumentException
+ * If the parameters of the format are inconsistent or if either the reader or format is null.
+ * @throws IOException
+ * If there is a problem reading the header or skipping the first record
+ * @throws CSVException Thrown on invalid input.
+ */
+ public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
+ String encoding) throws IOException {
Objects.requireNonNull(reader, "reader");
Objects.requireNonNull(format, "format");
this.format = format.copy();
- this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
+ this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
this.csvRecordIterator = new CSVRecordIterator();
this.headers = createHeaders();
this.characterOffset = characterOffset;
@@ -768,6 +797,7 @@ CSVRecord nextRecord() throws IOException {
recordList.clear();
StringBuilder sb = null;
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
+ final long startCharByte = lexer.getBytesRead() + this.characterOffset;
do {
reusableToken.reset();
lexer.nextToken(reusableToken);
@@ -805,7 +835,7 @@ CSVRecord nextRecord() throws IOException {
recordNumber++;
final String comment = Objects.toString(sb, null);
result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
- recordNumber, startCharPosition);
+ recordNumber, startCharPosition, startCharByte);
}
return result;
}
diff --git a/src/main/java/org/apache/commons/csv/CSVRecord.java b/src/main/java/org/apache/commons/csv/CSVRecord.java
index 1fac65843d..f0a0a6b816 100644
--- a/src/main/java/org/apache/commons/csv/CSVRecord.java
+++ b/src/main/java/org/apache/commons/csv/CSVRecord.java
@@ -48,6 +48,11 @@ public final class CSVRecord implements Serializable, Iterable {
*/
private final long characterPosition;
+ /**
+ * The byte offset at which this record starts in the source stream.
+ */
+ private final long characterByte;
+
/** The accumulated comments (if any) */
private final String comment;
@@ -67,8 +72,18 @@ public final class CSVRecord implements Serializable, Iterable {
this.parser = parser;
this.comment = comment;
this.characterPosition = characterPosition;
+ this.characterByte = 0L;
}
+    CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
+            final long characterPosition, final long characterByte) {
+        this.parser = parser;
+        this.comment = comment;
+        this.recordNumber = recordNumber;
+        this.values = values == null ? Constants.EMPTY_STRING_ARRAY : values; // never store null
+        this.characterPosition = characterPosition; // start of the record, counted in chars
+        this.characterByte = characterByte; // start of the record, counted in encoded bytes
+    }
/**
* Returns a value by {@link Enum}.
*
@@ -144,6 +159,15 @@ public long getCharacterPosition() {
return characterPosition;
}
+ /**
+ * Gets the byte offset at which this record starts in the source stream.
+ *
+ * @return the byte offset at which this record starts in the source stream.
+ */
+ public long getCharacterByte() {
+ return characterByte;
+ }
+
/**
* Returns the comment for this record, if any.
* Note that comments are attached to the following record.
diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
index 18c922a508..2a82d48a5a 100644
--- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
+++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
@@ -24,6 +24,10 @@
import java.io.IOException;
import java.io.Reader;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetEncoder;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedBufferedReader;
@@ -49,6 +53,13 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
private long position;
private long positionMark;
+ /** The number of bytes read so far */
+ private long bytesRead;
+ private long bytesReadMark;
+
+ /** Encoder used to calculate the bytes of characters */
+ CharsetEncoder encoder;
+
/**
* Constructs a new instance using the default buffer size.
*/
@@ -56,6 +67,13 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
super(reader);
}
+    /** Constructs a new instance; a non-null {@code encoding} names the charset used to count bytes read. */
+    ExtendedBufferedReader(final Reader reader, String encoding) {
+        super(reader);
+        // Without an encoding there is no encoder, and read() leaves the byte counter untouched.
+        encoder = encoding == null ? null : Charset.forName(encoding).newEncoder();
+    }
+
/**
* Closes the stream.
*
@@ -108,6 +126,7 @@ public void mark(final int readAheadLimit) throws IOException {
lineNumberMark = lineNumber;
lastCharMark = lastChar;
positionMark = position;
+ bytesReadMark = bytesRead;
super.mark(readAheadLimit);
}
@@ -118,11 +137,43 @@ public int read() throws IOException {
current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
lineNumber++;
}
+ if (encoder != null) {
+ this.bytesRead += getCharBytes(current);
+ }
lastChar = current;
position++;
return lastChar;
}
+    /**
+     * Gets the number of bytes that {@code current} occupies when encoded with {@link #encoder}.
+     *
+     * A Java char is a 16-bit UTF-16 code unit. U+0000 to U+FFFF (the BMP) is a single char, encoded
+     * by UTF-8 as 1 to 3 bytes. U+10000 to U+10FFFF (supplementary characters) is a pair of chars, a high
+     * surrogate (U+D800 to U+DBFF) followed by a low surrogate (U+DC00 to U+DFFF), encoded by UTF-8 as
+     * 4 bytes. A pair is encoded as a unit, so its size is reported when the low surrogate is read.
+     *
+     * @param current the value returned by the last read, negative at end of stream
+     * @return the encoded size in bytes; 0 at end of stream and for a high surrogate
+     * @throws CharacterCodingException if a low surrogate does not follow a high surrogate, or encoding fails
+     */
+    private long getCharBytes(int current) throws CharacterCodingException {
+        final char cChar = (char) current;
+        // EOF is negative; cast to char it would become U+FFFF and be counted as an encoded character.
+        // A high surrogate is counted together with the low surrogate that follows it.
+        if (current < 0 || Character.isHighSurrogate(cChar)) {
+            return 0;
+        }
+        if (!Character.isSurrogate(cChar)) {
+            return encoder.encode(CharBuffer.wrap(new char[] {cChar})).limit();
+        }
+        final char lChar = (char) lastChar;
+        if (!Character.isSurrogatePair(lChar, cChar)) {
+            throw new CharacterCodingException();
+        }
+        return encoder.encode(CharBuffer.wrap(new char[] {lChar, cChar})).limit();
+    }
+
@Override
public int read(final char[] buf, final int offset, final int length) throws IOException {
if (length == 0) {
@@ -187,7 +238,17 @@ public void reset() throws IOException {
lineNumber = lineNumberMark;
lastChar = lastCharMark;
position = positionMark;
+ bytesRead = bytesReadMark;
super.reset();
}
+ /**
+ * Gets the number of bytes counted by {@link #read()}, which only counts when an encoder was configured.
+ *
+ * @return the number of bytes read by the reader
+ */
+ long getBytesRead() {
+ return this.bytesRead;
+ }
+
}
diff --git a/src/main/java/org/apache/commons/csv/Lexer.java b/src/main/java/org/apache/commons/csv/Lexer.java
index 6d9c8a4850..afbba4d21d 100644
--- a/src/main/java/org/apache/commons/csv/Lexer.java
+++ b/src/main/java/org/apache/commons/csv/Lexer.java
@@ -103,6 +103,15 @@ long getCharacterPosition() {
return reader.getPosition();
}
+ /**
+ * Gets the number of bytes read, as reported by the underlying reader.
+ *
+ * @return the number of bytes read
+ */
+ long getBytesRead() {
+ return reader.getBytesRead();
+ }
+
/**
* Returns the current line number
*
diff --git a/src/test/java/org/apache/commons/csv/CSVParserTest.java b/src/test/java/org/apache/commons/csv/CSVParserTest.java
index 6a0637301d..f871308e8f 100644
--- a/src/test/java/org/apache/commons/csv/CSVParserTest.java
+++ b/src/test/java/org/apache/commons/csv/CSVParserTest.java
@@ -693,6 +693,84 @@ public void testGetHeaderComment_NoComment3() throws IOException {
}
}
+    /** Checks record start offsets (characters and bytes) for input with three-byte UTF-8 characters. */
+    @Test
+    public void testGetRecordThreeBytesRead() throws Exception {
+        final String code = "id,date,val5,val4\n" +
+            "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" +
+            "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" +
+            "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n";
+        final CSVFormat format = CSVFormat.Builder.create()
+            .setDelimiter(',')
+            .setQuote('\'')
+            .build();
+
+        // try-with-resources closes the parser even when an assertion fails.
+        try (CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8")) {
+            CSVRecord record;
+            assertEquals(0, parser.getRecordNumber());
+
+            // The first two records start inside the ASCII-only prefix, so byte and character offsets agree.
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(1, record.getRecordNumber());
+            assertEquals(code.indexOf('i'), record.getCharacterPosition());
+            assertEquals(record.getCharacterPosition(), record.getCharacterByte());
+
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(2, record.getRecordNumber());
+            assertEquals(code.indexOf('1'), record.getCharacterPosition());
+            assertEquals(record.getCharacterPosition(), record.getCharacterByte());
+
+            // From here on, each preceding Japanese character adds three bytes but only one char.
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(3, record.getRecordNumber());
+            assertEquals(code.indexOf('2'), record.getCharacterPosition());
+            assertEquals(95, record.getCharacterByte());
+
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(4, record.getRecordNumber());
+            assertEquals(code.indexOf('3'), record.getCharacterPosition());
+            assertEquals(154, record.getCharacterByte());
+        }
+    }
+
+    /** Checks record start offsets (characters and bytes) for input with four-byte UTF-8 characters. */
+    @Test
+    public void testGetRecordFourBytesRead() throws Exception {
+        final String code = "id,a,b,c\n" +
+            "1,😊,🤔,😂\n" +
+            "2,😊,🤔,😂\n" +
+            "3,😊,🤔,😂\n";
+        final CSVFormat format = CSVFormat.Builder.create()
+            .setDelimiter(',')
+            .setQuote('\'')
+            .build();
+
+        // try-with-resources closes the parser even when an assertion fails.
+        try (CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8")) {
+            CSVRecord record;
+            assertEquals(0, parser.getRecordNumber());
+
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(1, record.getRecordNumber());
+            assertEquals(code.indexOf('i'), record.getCharacterPosition());
+            assertEquals(record.getCharacterPosition(), record.getCharacterByte());
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(2, record.getRecordNumber());
+            assertEquals(code.indexOf('1'), record.getCharacterPosition());
+            assertEquals(record.getCharacterPosition(), record.getCharacterByte());
+            // Each emoji before this point is two chars (a surrogate pair) but four bytes.
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(3, record.getRecordNumber());
+            assertEquals(code.indexOf('2'), record.getCharacterPosition());
+            assertEquals(26, record.getCharacterByte());
+            assertNotNull(record = parser.nextRecord());
+            assertEquals(4, record.getRecordNumber());
+            assertEquals(code.indexOf('3'), record.getCharacterPosition());
+            assertEquals(43, record.getCharacterByte());
+        }
+    }
+
@Test
public void testGetHeaderMap() throws Exception {
try (final CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {
diff --git a/src/test/java/org/apache/commons/csv/JiraCsv196Test.java b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java
new file mode 100644
index 0000000000..7dbc23cafa
--- /dev/null
+++ b/src/test/java/org/apache/commons/csv/JiraCsv196Test.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.csv;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.StandardCharsets;
+
+
+import org.junit.jupiter.api.Test;
+
+
+/** Tests CSV-196: byte offsets reported for records that contain multi-byte UTF-8 characters. */
+public class JiraCsv196Test {
+
+    /** Three-byte UTF-8 sequences (Japanese text). */
+    @Test
+    public void parseThreeBytes() throws IOException {
+        assertRecordStartBytes("org/apache/commons/csv/CSV-196/japanese.csv", 0, 89, 242, 395);
+    }
+
+    /** Four-byte UTF-8 sequences (emoji, which Java represents as surrogate pairs). */
+    @Test
+    public void parseFourBytes() throws IOException {
+        assertRecordStartBytes("org/apache/commons/csv/CSV-196/emoji.csv", 0, 84, 701, 1318, 1935);
+    }
+
+    /**
+     * Parses the given classpath resource and checks the start byte of every record.
+     *
+     * @param path classpath location of the CSV resource
+     * @param expected expected value of getCharacterByte() for each record, in order
+     * @throws IOException if the resource cannot be read
+     */
+    private void assertRecordStartBytes(final String path, final long... expected) throws IOException {
+        final CSVFormat format = CSVFormat.Builder.create()
+            .setDelimiter(',')
+            .setQuote('\'')
+            .build();
+        // try-with-resources closes the parser (and its reader) even when an assertion fails.
+        try (CSVParser parser = format.parse(getTestInput(path), 0L, 1L, StandardCharsets.UTF_8.name())) {
+            int idx = 0;
+            for (final CSVRecord record : parser) {
+                assertEquals(expected[idx++], record.getCharacterByte());
+            }
+            // Guards against the parser silently returning fewer records than expected.
+            assertEquals(expected.length, idx);
+        }
+    }
+
+    /**
+     * Opens a classpath resource as UTF-8 text; the expected byte offsets are only valid for UTF-8,
+     * so the platform default charset must not be used.
+     */
+    private Reader getTestInput(final String path) {
+        return new InputStreamReader(
+            ClassLoader.getSystemClassLoader().getResourceAsStream(path), StandardCharsets.UTF_8);
+    }
+}
diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv b/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv
new file mode 100644
index 0000000000..0bff7a44f3
--- /dev/null
+++ b/src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv
@@ -0,0 +1,5 @@
+id,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15
+1,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄
+2,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄
+3,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄
+4,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄,😄😄😄😄😄😄😄😄😄😄
\ No newline at end of file
diff --git a/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv b/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv
new file mode 100644
index 0000000000..b06e04bd6a
--- /dev/null
+++ b/src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv
@@ -0,0 +1,4 @@
+id,date,val1,val2,val3,val4,val5,val6,val7,val8,val9,val10,val11,val12,val13,val14,val15
+00000000000001,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15
+00000000000002,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15
+00000000000003,2017-01-01,きちんと節分近くには咲いてる。自然の力ってすごいな~,v2,v3,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15
\ No newline at end of file