Skip to content

Commit

Permalink
Merge pull request #502 from marklogic/CSV-196-master
Browse files Browse the repository at this point in the history
CSV-196-TrackBytePositions
  • Loading branch information
garydgregory authored Jan 2, 2025
2 parents dd7b4b3 + d403084 commit b40039b
Show file tree
Hide file tree
Showing 10 changed files with 321 additions and 7 deletions.
2 changes: 2 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,8 @@
<exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>
Expand Down
52 changes: 49 additions & 3 deletions src/main/java/org/apache/commons/csv/CSVParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ public static class Builder extends AbstractStreamBuilder<CSVParser, Builder> {
private CSVFormat format;
private long characterOffset;
private long recordNumber = 1;
private boolean enableByteTracking;

/**
* Constructs a new instance.
Expand All @@ -166,7 +167,7 @@ protected Builder() {
/**
 * Builds a {@code CSVParser} from this builder's current settings.
 * <p>
 * {@code CSVFormat.DEFAULT} is used when no format has been set on the builder.
 * </p>
 *
 * @return a new parser reading from {@code getReader()}.
 * @throws IOException declared by {@code getReader()} and by the parser constructor.
 */
@SuppressWarnings("resource")
@Override
public CSVParser get() throws IOException {
// NOTE(review): two return statements appear back to back in this listing; presumably the first is the
// pre-change line and the second (which also passes getCharset() and enableByteTracking) replaces it - confirm.
return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber);
return new CSVParser(getReader(), format != null ? format : CSVFormat.DEFAULT, characterOffset, recordNumber, getCharset(), enableByteTracking);
}

/**
Expand Down Expand Up @@ -202,6 +203,18 @@ public Builder setRecordNumber(final long recordNumber) {
return asThis();
}

/**
 * Sets whether parsers created by this builder track byte positions.
 * <p>
 * When enabled, the parser is handed this builder's charset ({@code getCharset()}) so that it can count the
 * encoded bytes of the characters it reads.
 * </p>
 *
 * @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it.
 * @return this instance.
 * @since 1.13.0
 */
public Builder setEnableByteTracking(final boolean enableByteTracking) {
this.enableByteTracking = enableByteTracking;
return asThis();
}

}

final class CSVRecordIterator implements Iterator<CSVRecord> {
Expand Down Expand Up @@ -510,11 +523,43 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
@Deprecated
@SuppressWarnings("resource")
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
throws IOException {
// Delegates with a null charset and byte tracking switched off.
this(reader, format, characterOffset, recordNumber, null, false);
}

/**
* Constructs a new instance using the given {@link CSVFormat}
*
* <p>
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
* unless you close the {@code reader}.
* </p>
*
* @param reader
* a Reader containing CSV-formatted input. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @param characterOffset
* Lexer offset when the parser does not start parsing at the beginning of the source.
* @param recordNumber
* The next record number to assign.
* @param charset
* The character encoding to be used for the reader when enableByteTracking is true.
* @param enableByteTracking
* {@code true} to enable byte tracking for the parser; {@code false} to disable it.
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent.
* @throws NullPointerException
* If either the reader or format is null.
* @throws IOException
* If there is a problem reading the header or skipping the first record.
* @throws CSVException Thrown on invalid input.
*/
private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
final Charset charset, final boolean enableByteTracking)
throws IOException {
Objects.requireNonNull(reader, "reader");
Objects.requireNonNull(format, "format");
this.format = format.copy();
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset, enableByteTracking));
this.csvRecordIterator = new CSVRecordIterator();
this.headers = createHeaders();
this.characterOffset = characterOffset;
Expand Down Expand Up @@ -841,6 +886,7 @@ CSVRecord nextRecord() throws IOException {
recordList.clear();
StringBuilder sb = null;
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
final long startBytePosition = lexer.getBytesRead() + this.characterOffset;
do {
reusableToken.reset();
lexer.nextToken(reusableToken);
Expand Down Expand Up @@ -878,7 +924,7 @@ CSVRecord nextRecord() throws IOException {
recordNumber++;
final String comment = Objects.toString(sb, null);
result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
recordNumber, startCharPosition);
recordNumber, startCharPosition, startBytePosition);
}
return result;
}
Expand Down
21 changes: 18 additions & 3 deletions src/main/java/org/apache/commons/csv/CSVRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
*/
private final long characterPosition;

/**
* The starting position of this record in the source stream, measured in bytes.
*/
private final long bytePosition;

/** The accumulated comments (if any) */
private final String comment;

Expand All @@ -62,15 +67,15 @@ public final class CSVRecord implements Serializable, Iterable<String> {
/** The parser that originates this record. This is not serialized. */
private final transient CSVParser parser;

/**
 * Constructs a record from already-parsed values and its position information.
 *
 * @param parser the parser that originates this record.
 * @param values the record values; {@code null} is replaced by an empty array.
 * @param comment the accumulated comments, if any.
 * @param recordNumber the number assigned to this record.
 * @param characterPosition the character position of this record in the source stream.
 * @param bytePosition the starting position of this record in the source stream, measured in bytes.
 */
// NOTE(review): two signatures appear back to back in this listing; presumably the first (without
// bytePosition) is the pre-change signature and the second replaces it - confirm.
CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
final long characterPosition) {
CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
final long characterPosition, final long bytePosition) {
this.recordNumber = recordNumber;
// Never store null: callers can rely on a non-null (possibly empty) array.
this.values = values != null ? values : Constants.EMPTY_STRING_ARRAY;
this.parser = parser;
this.comment = comment;
this.characterPosition = characterPosition;
this.bytePosition = bytePosition;
}

/**
* Returns a value by {@link Enum}.
*
Expand Down Expand Up @@ -146,6 +151,16 @@ public long getCharacterPosition() {
return characterPosition;
}

/**
 * Gets the byte offset in the source stream at which this record starts.
 *
 * @return the starting position of this record, measured in bytes.
 * @since 1.13.0
 */
public long getBytePosition() {
    return this.bytePosition;
}

/**
* Returns the comment for this record, if any.
* Note that comments are attached to the following record.
Expand Down
86 changes: 86 additions & 0 deletions src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@

import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedBufferedReader;
Expand All @@ -51,13 +55,36 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
private long position;
private long positionMark;

/** The number of bytes read so far; {@code read()} adds to it only when an encoder is present. */
private long bytesRead;
/** The value of {@code bytesRead} saved by {@code mark} and restored by {@code reset}. */
private long bytesReadMark;

/**
 * Encoder for calculating the number of bytes for each character read.
 * It is only assigned when byte tracking is enabled and a character set is provided.
 */
private CharsetEncoder encoder;

/**
 * Constructs a new instance using the default buffer size.
 * <p>
 * No encoder is created by this constructor, so the byte count is not updated while reading.
 * </p>
 *
 * @param reader the reader to wrap.
 */
ExtendedBufferedReader(final Reader reader) {
super(reader);
}

/**
 * Constructs a new instance using the default buffer size, optionally tracking the number of bytes read.
 * <p>
 * An encoder is created only when byte tracking is enabled and a character set is provided; otherwise
 * no byte counting takes place.
 * </p>
 *
 * @param reader the reader to wrap.
 * @param charset the character set used to compute encoded byte lengths, or {@code null} if not applicable.
 * @param enableByteTracking {@code true} to enable byte tracking; {@code false} to disable it.
 */
ExtendedBufferedReader(final Reader reader, final Charset charset, final boolean enableByteTracking) {
    super(reader);
    if (charset != null && enableByteTracking) {
        encoder = charset.newEncoder();
    }
}

/**
* Closes the stream.
*
Expand Down Expand Up @@ -110,6 +137,7 @@ public void mark(final int readAheadLimit) throws IOException {
lineNumberMark = lineNumber;
lastCharMark = lastChar;
positionMark = position;
bytesReadMark = bytesRead;
super.mark(readAheadLimit);
}

Expand All @@ -120,11 +148,59 @@ public int read() throws IOException {
current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
lineNumber++;
}
if (encoder != null) {
this.bytesRead += getEncodedCharLength(current);
}
lastChar = current;
position++;
return lastChar;
}

/**
 * Gets the number of bytes the encoder produces for the character that was just read.
 * <p>
 * Java represents text as 16-bit {@code char} values, so a Unicode code point arrives here in one of two forms:
 * </p>
 * <ul>
 * <li><b>U+0000 to U+FFFF (Basic Multilingual Plane, BMP):</b> a single {@code char}, encoded on its own.</li>
 * <li><b>U+10000 to U+10FFFF (supplementary characters):</b> a pair of {@code char}s, a high surrogate
 * (\uD800-\uDBFF) followed by a low surrogate (\uDC00-\uDFFF). Nothing is counted for the high surrogate;
 * the whole pair is encoded, and its full length returned, when the low surrogate arrives.</li>
 * </ul>
 * <p>
 * The end-of-stream marker (a negative value) is not a character and contributes no bytes.
 * </p>
 *
 * @param current the value just returned by the underlying read; negative at the end of the stream.
 * @return the encoded byte length attributed to {@code current}.
 * @throws CharacterCodingException if the character cannot be encoded, or if a low surrogate is not
 *         preceded by a high surrogate.
 */
private int getEncodedCharLength(final int current) throws CharacterCodingException {
    if (current < 0) {
        // End of stream: casting -1 to char would yield U+FFFF, which would either be counted as real
        // bytes (3 in UTF-8) or be rejected as unmappable by charsets such as US-ASCII.
        return 0;
    }
    final char cChar = (char) current;
    if (!Character.isSurrogate(cChar)) {
        return encoder.encode(CharBuffer.wrap(new char[] {cChar})).limit();
    }
    if (Character.isHighSurrogate(cChar)) {
        // Defer counting until the low surrogate completes the pair.
        return 0;
    }
    // cChar is a low surrogate: it is only valid directly after a high surrogate.
    final char lChar = (char) lastChar;
    if (Character.isSurrogatePair(lChar, cChar)) {
        return encoder.encode(CharBuffer.wrap(new char[] {lChar, cChar})).limit();
    }
    throw new CharacterCodingException();
}

@Override
public int read(final char[] buf, final int offset, final int length) throws IOException {
if (length == 0) {
Expand Down Expand Up @@ -189,7 +265,17 @@ public void reset() throws IOException {
lineNumber = lineNumberMark;
lastChar = lastCharMark;
position = positionMark;
bytesRead = bytesReadMark;
super.reset();
}

/**
 * Gets the running count of bytes read by this reader.
 *
 * @return the number of bytes read by the reader.
 */
long getBytesRead() {
    return bytesRead;
}

}
9 changes: 9 additions & 0 deletions src/main/java/org/apache/commons/csv/Lexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,15 @@ long getCharacterPosition() {
return reader.getPosition();
}

/**
 * Gets the number of bytes read, as reported by the underlying reader.
 *
 * @return the number of bytes read.
 */
long getBytesRead() {
return reader.getBytesRead();
}

/**
* Returns the current line number
*
Expand Down
70 changes: 70 additions & 0 deletions src/test/java/org/apache/commons/csv/CSVParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,76 @@ public void testGetHeaderComment_NoComment3() throws IOException {
}
}

/**
 * Checks record byte positions when fields contain characters that take three bytes each in UTF-8.
 * <p>
 * The first two records start after ASCII-only content, so their byte and character positions coincide;
 * the later records start after multi-byte content, so their byte positions run ahead.
 * </p>
 */
@Test
public void testGetRecordThreeBytesRead() throws Exception {
    // NOTE(review): the expected offsets 95 and 154 assume every non-ASCII character in the sample,
    // including the tilde that ends each Japanese field, encodes to 3 bytes in UTF-8 - confirm the
    // literal still contains the fullwidth form of that character.
    final String code = "id,date,val5,val4\n" +
        "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" +
        "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" +
        "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n";
    final CSVFormat format = CSVFormat.Builder.create()
        .setDelimiter(',')
        .setQuote('\'')
        .get();
    try (CSVParser parser = CSVParser.builder()
            .setReader(new StringReader(code))
            .setFormat(format)
            .setCharset(UTF_8)
            .setEnableByteTracking(true)
            .get()) {
        CSVRecord record;

        assertEquals(0, parser.getRecordNumber());
        assertNotNull(record = parser.nextRecord());
        assertEquals(1, record.getRecordNumber());
        assertEquals(code.indexOf('i'), record.getCharacterPosition());
        // Only ASCII precedes this record, so bytes and characters agree.
        assertEquals(record.getCharacterPosition(), record.getBytePosition());

        assertNotNull(record = parser.nextRecord());
        assertEquals(2, record.getRecordNumber());
        assertEquals(code.indexOf('1'), record.getCharacterPosition());
        assertEquals(record.getCharacterPosition(), record.getBytePosition());

        assertNotNull(record = parser.nextRecord());
        assertEquals(3, record.getRecordNumber());
        assertEquals(code.indexOf('2'), record.getCharacterPosition());
        assertEquals(95, record.getBytePosition());

        assertNotNull(record = parser.nextRecord());
        assertEquals(4, record.getRecordNumber());
        assertEquals(code.indexOf('3'), record.getCharacterPosition());
        assertEquals(154, record.getBytePosition());
    }
}

/**
 * Checks record byte positions when fields contain supplementary characters (emoji), each of which is a
 * surrogate pair in Java and four bytes in UTF-8.
 * <p>
 * The header is 9 bytes and every data line is 17 bytes (5 ASCII bytes plus three 4-byte emoji), which
 * gives the expected offsets 9, 26 and 43.
 * </p>
 */
@Test
public void testGetRecordFourBytesRead() throws Exception {
    final String code = "id,a,b,c\n" +
        "1,😊,🤔,😂\n" +
        "2,😊,🤔,😂\n" +
        "3,😊,🤔,😂\n";
    final CSVFormat format = CSVFormat.Builder.create()
        .setDelimiter(',')
        .setQuote('\'')
        .get();
    try (CSVParser parser = CSVParser.builder()
            .setReader(new StringReader(code))
            .setFormat(format)
            .setCharset(UTF_8)
            .setEnableByteTracking(true)
            .get()) {
        CSVRecord record;

        assertEquals(0, parser.getRecordNumber());
        assertNotNull(record = parser.nextRecord());
        assertEquals(1, record.getRecordNumber());
        assertEquals(code.indexOf('i'), record.getCharacterPosition());
        // Only ASCII precedes this record, so bytes and characters agree.
        assertEquals(record.getCharacterPosition(), record.getBytePosition());

        assertNotNull(record = parser.nextRecord());
        assertEquals(2, record.getRecordNumber());
        assertEquals(code.indexOf('1'), record.getCharacterPosition());
        assertEquals(record.getCharacterPosition(), record.getBytePosition());

        assertNotNull(record = parser.nextRecord());
        assertEquals(3, record.getRecordNumber());
        assertEquals(code.indexOf('2'), record.getCharacterPosition());
        assertEquals(26, record.getBytePosition());

        assertNotNull(record = parser.nextRecord());
        assertEquals(4, record.getRecordNumber());
        assertEquals(code.indexOf('3'), record.getCharacterPosition());
        assertEquals(43, record.getBytePosition());
    }
}

@Test
public void testGetHeaderMap() throws Exception {
try (CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/org/apache/commons/csv/CSVRecordTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ record = parser.iterator().next();
// Verifies that a record built with a null values array is empty and rejects lookups by header name.
@Test
public void testCSVRecordNULLValues() throws IOException {
try (CSVParser parser = CSVParser.parse("A,B\r\nONE,TWO", CSVFormat.DEFAULT.withHeader())) {
// NOTE(review): two declarations of csvRecord appear back to back in this listing; presumably the first is
// the pre-change line and the second (with the extra byte-position argument) replaces it - confirm.
final CSVRecord csvRecord = new CSVRecord(parser, null, null, 0L, 0L);
final CSVRecord csvRecord = new CSVRecord(parser, null, null, 0L, 0L, 0L);
assertEquals(0, csvRecord.size());
assertThrows(IllegalArgumentException.class, () -> csvRecord.get("B"));
}
Expand Down
Loading

0 comments on commit b40039b

Please sign in to comment.