Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CSV-196] Track byte position #502

Merged
merged 9 commits into from
Jan 2, 2025
2 changes: 2 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,8 @@
<exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>
Expand Down
50 changes: 47 additions & 3 deletions src/main/java/org/apache/commons/csv/CSVParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
private CSVFormat format;
private long characterOffset;
private long recordNumber = 1;
private boolean enableByteTracking;

/**
* Constructs a new instance.
Expand All @@ -164,7 +165,7 @@ protected Builder() {
@SuppressWarnings("resource")
@Override
public CSVParser get() throws IOException {
return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber);
return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), enableByteTracking);
}

/**
Expand Down Expand Up @@ -200,6 +201,17 @@ public Builder setRecordNumber(final long recordNumber) {
return asThis();
}

/**
* Sets whether to enable byte tracking for the parser.
*
* @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it.
* @return this instance.
*/
garydgregory marked this conversation as resolved.
Show resolved Hide resolved
public Builder setEnableByteTracking(final boolean enableByteTracking) {
garydgregory marked this conversation as resolved.
Show resolved Hide resolved
this.enableByteTracking = enableByteTracking;
return asThis();
}

}

final class CSVRecordIterator implements Iterator<CSVRecord> {
Expand Down Expand Up @@ -506,11 +518,42 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
@Deprecated
@SuppressWarnings("resource")
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
throws IOException {
this(reader, format, characterOffset, recordNumber, null, false);
}

/**
* Constructs a new instance using the given {@link CSVFormat}
*
* <p>
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
* unless you close the {@code reader}.
* </p>
*
* @param reader
* a Reader containing CSV-formatted input. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @param characterOffset
* Lexer offset when the parser does not start parsing at the beginning of the source.
* @param recordNumber
* The next record number to assign.
* @param charset
* The character encoding to be used for the reader.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"The character encoding to be used for the reader."
->
"The character encoding to be used for the reader when enableByteTracking is true."

* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either the reader or format is null.
* @throws IOException
* If there is a problem reading the header or skipping the first record.
* @throws CSVException Thrown on invalid input.
* @since 1.13.0.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New private elements do not need a Javadoc since tag.

*/
private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
final Charset charset, final boolean enableByteTracking)
garydgregory marked this conversation as resolved.
Show resolved Hide resolved
throws IOException {
Objects.requireNonNull(reader, "reader");
Objects.requireNonNull(format, "format");
this.format = format.copy();
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, enableByteTracking));
this.csvRecordIterator = new CSVRecordIterator();
this.headers = createHeaders();
this.characterOffset = characterOffset;
Expand Down Expand Up @@ -837,6 +880,7 @@ CSVRecord nextRecord() throws IOException {
recordList.clear();
StringBuilder sb = null;
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
final long startCharByte = lexer.getBytesRead() + this.characterOffset;
do {
reusableToken.reset();
lexer.nextToken(reusableToken);
Expand Down Expand Up @@ -874,7 +918,7 @@ CSVRecord nextRecord() throws IOException {
recordNumber++;
final String comment = Objects.toString(sb, null);
result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
recordNumber, startCharPosition);
recordNumber, startCharPosition, startCharByte);
}
return result;
}
Expand Down
24 changes: 24 additions & 0 deletions src/main/java/org/apache/commons/csv/CSVRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
*/
private final long characterPosition;

/**
* The start byte of this record as a character byte in the source stream.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment is confusing because it talks about a byte but the type is a long. It would be better to explain the mismatch, if that is indeed the intent.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are calculating the number of bytes required to encode a single character and storing this value as a long. This choice provides a larger range for the position and ensures consistency with the design, as position is also defined as a long.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please update the comment to clarify.

*/
private final long characterByte;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This name is too confusing IMO, please rename to bytePosition to mirror the existing characterPosition.


/** The accumulated comments (if any) */
private final String comment;

Expand All @@ -67,8 +72,18 @@ public final class CSVRecord implements Serializable, Iterable<String> {
this.parser = parser;
this.comment = comment;
this.characterPosition = characterPosition;
this.characterByte = 0L;
}

CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
garydgregory marked this conversation as resolved.
Show resolved Hide resolved
final long characterPosition, final long characterByte) {
this.recordNumber = recordNumber;
this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY;
this.parser = parser;
this.comment = comment;
this.characterPosition = characterPosition;
this.characterByte = characterByte;
}
/**
* Returns a value by {@link Enum}.
*
Expand Down Expand Up @@ -144,6 +159,15 @@ public long getCharacterPosition() {
return characterPosition;
}

/**
* Gets the start byte of this record as a character byte in the source stream
*
* @return the start byte of this record as a character byte in the source stream.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either the code or the comment is wrong. The return value is a long but the comment talks about a byte. It would help to make the comment clear as to why the return type is a long if that is indeed correct.

Copy link
Member

@garydgregory garydgregory Dec 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New public and protected elements should have a Javadoc tag of @since 1.13.0.
Please clarify the comment: Specifically document this data as "position".

*/
public long getCharacterByte() {
return characterByte;
}

/**
* Returns the comment for this record, if any.
* Note that comments are attached to the following record.
Expand Down
84 changes: 84 additions & 0 deletions src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@

import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedBufferedReader;
Expand All @@ -49,13 +53,36 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
private long position;
private long positionMark;

/** The number of bytes read so far. */
private long bytesRead;
private long bytesReadMark;

/** Encoder for calculating the number of bytes for each character read. */
private CharsetEncoder encoder;

/**
* Constructs a new instance using the default buffer size.
*/
ExtendedBufferedReader(final Reader reader) {
super(reader);
}

/**
* Constructs a new instance with the specified reader, character set,
* and byte tracking option. Initializes an encoder if byte tracking is enabled
* and a character set is provided.
*
* @param reader the reader supports a look-ahead option.
* @param charset the character set for encoding, or {@code null} if not applicable.
* @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it.
*/
ExtendedBufferedReader(final Reader reader, Charset charset, boolean enableByteTracking) {
garydgregory marked this conversation as resolved.
Show resolved Hide resolved
super(reader);
if (charset != null && enableByteTracking) {
encoder = charset.newEncoder();
}
}

/**
* Closes the stream.
*
Expand Down Expand Up @@ -108,6 +135,7 @@ public void mark(final int readAheadLimit) throws IOException {
lineNumberMark = lineNumber;
lastCharMark = lastChar;
positionMark = position;
bytesReadMark = bytesRead;
super.mark(readAheadLimit);
}

Expand All @@ -118,11 +146,57 @@ public int read() throws IOException {
current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
lineNumber++;
}
if (encoder != null) {
garydgregory marked this conversation as resolved.
Show resolved Hide resolved
this.bytesRead += getCharBytes(current);
}
lastChar = current;
position++;
return lastChar;
}

/**
* Gets the byte length of the given character based on the the original Unicode
* specification, which defined characters as fixed-width 16-bit entities.
* <p>
* The Unicode characters are divided into two main ranges:
* <ul>
* <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b>
* <ul>
* <li>Represented using a single 16-bit {@code char}.</li>
* <li>Includes UTF-8 encodings of 1-byte, 2-byte, and some 3-byte characters.</li>
* </ul>
* </li>
* <li><b>U+10000 to U+10FFFF (Supplementary Characters):</b>
* <ul>
* <li>Represented as a pair of {@code char}s:</li>
* <li>The first {@code char} is from the high-surrogates range (\uD800-\uDBFF).</li>
* <li>The second {@code char} is from the low-surrogates range (\uDC00-\uDFFF).</li>
* <li>Includes UTF-8 encodings of some 3-byte characters and all 4-byte characters.</li>
* </ul>
* </li>
* </ul>
*
* @param current the current character to process.
* @return the byte length of the character.
* @throws CharacterCodingException if the character cannot be encoded.
*/
private long getCharBytes(int current) throws CharacterCodingException {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type here should be an int because this API returns 0 or CharBuffer.limit() which itself returns an int.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a better method name here is getEncodedCharLength().

final char cChar = (char) current;
final char lChar = (char) lastChar;
if (!Character.isSurrogate(cChar)) {
return encoder.encode(
CharBuffer.wrap(new char[] {cChar})).limit();
} else {
if (Character.isHighSurrogate(cChar)) {
// Move on to the next char (low surrogate)
return 0;
} else if (Character.isSurrogatePair(lChar, cChar)) {
return encoder.encode(
CharBuffer.wrap(new char[] {lChar, cChar})).limit();
} else throw new CharacterCodingException();
}
}

@Override
public int read(final char[] buf, final int offset, final int length) throws IOException {
if (length == 0) {
Expand Down Expand Up @@ -187,7 +261,17 @@ public void reset() throws IOException {
lineNumber = lineNumberMark;
lastChar = lastCharMark;
position = positionMark;
bytesRead = bytesReadMark;
super.reset();
}

/**
* Gets the number of bytes read by the reader.
*
* @return the number of bytes read by the read
*/
long getBytesRead() {
return this.bytesRead;
}

}
9 changes: 9 additions & 0 deletions src/main/java/org/apache/commons/csv/Lexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,15 @@ long getCharacterPosition() {
return reader.getPosition();
}

/**
* Gets the number of bytes read
*
* @return the number of bytes read
*/
long getBytesRead() {
return reader.getBytesRead();
}

/**
* Returns the current line number
*
Expand Down
71 changes: 71 additions & 0 deletions src/test/java/org/apache/commons/csv/CSVParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,77 @@ public void testGetHeaderComment_NoComment3() throws IOException {
}
}

@Test
public void testGetRecordThreeBytesRead() throws Exception {
garydgregory marked this conversation as resolved.
Show resolved Hide resolved
final String code = "id,date,val5,val4\n" +
"11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" +
"22222222222222,'4017-01-01',おはよう私の友人~,v4\n" +
"33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n";
final CSVFormat format = CSVFormat.Builder.create()
.setDelimiter(',')
.setQuote('\'')
.get();
try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get() ) {
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
garydgregory marked this conversation as resolved.
Show resolved Hide resolved

assertEquals(0, parser.getRecordNumber());
assertNotNull(record = parser.nextRecord());
assertEquals(1, record.getRecordNumber());
assertEquals(code.indexOf('i'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), record.getCharacterPosition());

assertNotNull(record = parser.nextRecord());
assertEquals(2, record.getRecordNumber());
assertEquals(code.indexOf('1'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), record.getCharacterPosition());

assertNotNull(record = parser.nextRecord());
assertEquals(3, record.getRecordNumber());
assertEquals(code.indexOf('2'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), 95);

assertNotNull(record = parser.nextRecord());
assertEquals(4, record.getRecordNumber());
assertEquals(code.indexOf('3'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), 154);
};

}

@Test
public void testGetRecordFourBytesRead() throws Exception {
final String code = "id,a,b,c\n" +
"1,😊,🤔,😂\n" +
"2,😊,🤔,😂\n" +
"3,😊,🤔,😂\n";
final CSVFormat format = CSVFormat.Builder.create()
.setDelimiter(',')
.setQuote('\'')
.get();
try (CSVParser parser = CSVParser.builder().setReader(new StringReader(code)).setFormat(format).setCharset(UTF_8).setEnableByteTracking(true).get()) {
CSVRecord record = new CSVRecord(parser, null, null, 1L, 0L, 0L);
garydgregory marked this conversation as resolved.
Show resolved Hide resolved

assertEquals(0, parser.getRecordNumber());
assertNotNull(record = parser.nextRecord());
assertEquals(1, record.getRecordNumber());
assertEquals(code.indexOf('i'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), record.getCharacterPosition());

assertNotNull(record = parser.nextRecord());
assertEquals(2, record.getRecordNumber());
assertEquals(code.indexOf('1'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), record.getCharacterPosition());
assertNotNull(record = parser.nextRecord());
assertEquals(3, record.getRecordNumber());
assertEquals(code.indexOf('2'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), 26);
assertNotNull(record = parser.nextRecord());
assertEquals(4, record.getRecordNumber());
assertEquals(code.indexOf('3'), record.getCharacterPosition());
assertEquals(record.getCharacterByte(), 43);
}
}

@Test
public void testGetHeaderMap() throws Exception {
try (final CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {
Expand Down
Loading
Loading