Skip to content

Commit

Permalink
Add support in Commons CSV for tracking byte positions during parsing (
Browse files Browse the repository at this point in the history
…#9) (#10)

Add support in Commons CSV for tracking byte positions during parsing
  • Loading branch information
DarrenJAN authored Nov 7, 2024
1 parent 74f0970 commit f0a2398
Show file tree
Hide file tree
Showing 10 changed files with 315 additions and 2 deletions.
3 changes: 3 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
<url>https://commons.apache.org/proper/commons-csv/</url>
<inceptionYear>2005</inceptionYear>
<description>The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types.</description>
<packaging>jar</packaging>

<dependencies>
<dependency>
Expand Down Expand Up @@ -231,6 +232,8 @@
<exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>
Expand Down
24 changes: 24 additions & 0 deletions src/main/java/org/apache/commons/csv/CSVFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -2074,6 +2074,30 @@ public CSVParser parse(final Reader reader) throws IOException {
return new CSVParser(reader, this);
}

/**
* Parses the specified content, tracking the byte position of each record.
*
* <p>
* The returned {@link CSVParser} reads from {@code reader}. The {@code characterOffset} and {@code recordNumber} arguments seed the
* positions and record numbers the parser reports; they do not cause any input to be skipped by this method. The {@code encoding}
* argument names the charset used to measure how many bytes each character occupies.
* </p>
*
* <p>
* For additional parsing options, see the various static parse methods available on {@link CSVParser}.
* </p>
*
* @param reader the input to parse
* @param characterOffset the offset added to reported positions when parsing does not start at the beginning of the source
* @param recordNumber the next record number to assign
* @param encoding the name of the charset used to compute byte positions
* @return a parser over a stream of {@link CSVRecord}s.
* @throws IOException If an I/O error occurs
* @throws CSVException Thrown on invalid input.
*/
public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException {
return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
}

/**
* Prints to the specified output.
*
Expand Down
34 changes: 32 additions & 2 deletions src/main/java/org/apache/commons/csv/CSVParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -438,10 +438,39 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
@SuppressWarnings("resource")
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
throws IOException {
// Delegates to the encoding-aware constructor, passing null because this overload has no encoding to offer.
this(reader, format, characterOffset, recordNumber, null);
}

/**
* Constructs a new instance using the given {@link CSVFormat}
*
* <p>
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
* unless you close the {@code reader}.
* </p>
*
* @param reader
* a Reader containing CSV-formatted input. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @param characterOffset
* Lexer offset when the parser does not start parsing at the beginning of the source.
* @param recordNumber
* The next record number to assign
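* @param encoding
* The name of the charset used to compute the byte position of each record. May be {@code null}, in which case no encoder is created for byte counting.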
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either the reader or format is null.
* @throws IOException
* If there is a problem reading the header or skipping the first record
* @throws CSVException Thrown on invalid input.
*/
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
String encoding) throws IOException {
Objects.requireNonNull(reader, "reader");
Objects.requireNonNull(format, "format");
this.format = format.copy();
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
this.csvRecordIterator = new CSVRecordIterator();
this.headers = createHeaders();
this.characterOffset = characterOffset;
Expand Down Expand Up @@ -768,6 +797,7 @@ CSVRecord nextRecord() throws IOException {
recordList.clear();
StringBuilder sb = null;
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
final long startCharByte = lexer.getBytesRead() + this.characterOffset;
do {
reusableToken.reset();
lexer.nextToken(reusableToken);
Expand Down Expand Up @@ -805,7 +835,7 @@ CSVRecord nextRecord() throws IOException {
recordNumber++;
final String comment = Objects.toString(sb, null);
result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
recordNumber, startCharPosition);
recordNumber, startCharPosition, startCharByte);
}
return result;
}
Expand Down
24 changes: 24 additions & 0 deletions src/main/java/org/apache/commons/csv/CSVRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
*/
private final long characterPosition;

/**
* The byte position in the source stream at which this record starts.
*/
private final long characterByte;

/** The accumulated comments (if any) */
private final String comment;

Expand All @@ -67,8 +72,18 @@ public final class CSVRecord implements Serializable, Iterable<String> {
this.parser = parser;
this.comment = comment;
this.characterPosition = characterPosition;
this.characterByte = 0L;
}

/**
 * Creates a record that also remembers the byte position at which it starts.
 *
 * @param parser the parser stored with this record.
 * @param values the column values; {@code null} is replaced by the shared empty array.
 * @param comment the comment stored with this record.
 * @param recordNumber the number of this record.
 * @param characterPosition the character position at which this record starts.
 * @param characterByte the byte position at which this record starts.
 */
CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
        final long characterPosition, final long characterByte) {
    this.parser = parser;
    this.recordNumber = recordNumber;
    this.comment = comment;
    this.characterPosition = characterPosition;
    this.characterByte = characterByte;
    // A missing value array is stored as the shared empty array so the field is never null.
    if (values == null) {
        this.values = Constants.EMPTY_STRING_ARRAY;
    } else {
        this.values = values;
    }
}
/**
* Returns a value by {@link Enum}.
*
Expand Down Expand Up @@ -144,6 +159,15 @@ public long getCharacterPosition() {
return characterPosition;
}

/**
* Returns the byte position in the source stream at which this record starts.
*
* <p>
* The value is whatever was passed as {@code characterByte} when the record was constructed; the constructor that takes no
* such argument stores {@code 0}.
* </p>
*
* @return the byte position in the source stream at which this record starts.
*/
public long getCharacterByte() {
return characterByte;
}

/**
* Returns the comment for this record, if any.
* Note that comments are attached to the following record.
Expand Down
61 changes: 61 additions & 0 deletions src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@

import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedBufferedReader;
Expand All @@ -49,13 +53,27 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
private long position;
private long positionMark;

/** The number of bytes read so far */
private long bytesRead;
private long bytesReadMark;

/** Encoder used to calculate the bytes of characters */
CharsetEncoder encoder;

/**
* Constructs a new instance using the default buffer size.
*
* <p>
* This constructor does not assign the {@code encoder} field.
* </p>
*
* @param reader the reader passed to the superclass constructor.
*/
ExtendedBufferedReader(final Reader reader) {
super(reader);
}

ExtendedBufferedReader(final Reader reader, String encoding) {
super(reader);
if (encoding != null) {
encoder = Charset.forName(encoding).newEncoder();
}
}

/**
* Closes the stream.
*
Expand Down Expand Up @@ -108,6 +126,7 @@ public void mark(final int readAheadLimit) throws IOException {
lineNumberMark = lineNumber;
lastCharMark = lastChar;
positionMark = position;
bytesReadMark = bytesRead;
super.mark(readAheadLimit);
}

Expand All @@ -118,11 +137,43 @@ public int read() throws IOException {
current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
lineNumber++;
}
if (encoder != null) {
this.bytesRead += getCharBytes(current);
}
lastChar = current;
position++;
return lastChar;
}

/**
 * Computes how many bytes the value just returned by the underlying read occupies in the configured encoding.
 *
 * <p>
 * A Java {@code char} is a 16-bit UTF-16 code unit:
 * </p>
 * <ul>
 * <li>U+0000 to U+FFFF (the BMP) is represented by a single {@code char}; in UTF-8 these are the 1-, 2- and 3-byte
 * characters.</li>
 * <li>U+10000 to U+10FFFF (supplementary characters) are represented by a pair of {@code char}s: a high surrogate
 * (U+D800 to U+DBFF) followed by a low surrogate (U+DC00 to U+DFFF); in UTF-8 these are the 4-byte characters.</li>
 * </ul>
 *
 * <p>
 * A high surrogate contributes {@code 0}; the whole pair is measured once its low surrogate arrives, using
 * {@code lastChar} as the high half. A negative value (end of stream) also contributes {@code 0}, because no character
 * was read.
 * </p>
 *
 * @param current the value returned by the read, either a UTF-16 code unit or a negative end-of-stream marker.
 * @return the number of bytes to add to the running byte count.
 * @throws CharacterCodingException if a low surrogate does not follow a high surrogate, or if the encoder cannot encode
 *         the character.
 */
private long getCharBytes(int current) throws CharacterCodingException {
    if (current < 0) {
        // End of stream. Casting -1 to char would yield U+FFFF, which is 3 bytes in UTF-8 and unmappable in
        // charsets such as ISO-8859-1, so it must not reach the encoder.
        return 0;
    }
    final char cChar = (char) current;
    if (!Character.isSurrogate(cChar)) {
        return encoder.encode(CharBuffer.wrap(new char[] {cChar})).limit();
    }
    if (Character.isHighSurrogate(cChar)) {
        // Counted together with the low surrogate that should follow.
        return 0;
    }
    final char lChar = (char) lastChar;
    if (Character.isSurrogatePair(lChar, cChar)) {
        return encoder.encode(CharBuffer.wrap(new char[] {lChar, cChar})).limit();
    }
    // A low surrogate that is not preceded by a high surrogate cannot be encoded.
    throw new CharacterCodingException();
}

@Override
public int read(final char[] buf, final int offset, final int length) throws IOException {
if (length == 0) {
Expand Down Expand Up @@ -187,7 +238,17 @@ public void reset() throws IOException {
lineNumber = lineNumberMark;
lastChar = lastCharMark;
position = positionMark;
bytesRead = bytesReadMark;
super.reset();
}

/**
* Gets the running byte count accumulated by this reader.
*
* <p>
* NOTE(review): in the single-character {@code read()} the count is only advanced when an encoder has been configured;
* confirm whether the other read methods update it as well.
* </p>
*
* @return the number of bytes counted so far by this reader
*/
long getBytesRead() {
return this.bytesRead;
}

}
9 changes: 9 additions & 0 deletions src/main/java/org/apache/commons/csv/Lexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,15 @@ long getCharacterPosition() {
return reader.getPosition();
}

/**
* Returns the number of bytes read so far, as reported by the underlying reader.
*
* @return the number of bytes read
*/
long getBytesRead() {
return reader.getBytesRead();
}

/**
* Returns the current line number
*
Expand Down
78 changes: 78 additions & 0 deletions src/test/java/org/apache/commons/csv/CSVParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,84 @@ public void testGetHeaderComment_NoComment3() throws IOException {
}
}

/**
 * Verifies record byte offsets when fields contain characters that take three bytes each in UTF-8.
 *
 * <p>
 * The header and the start of the first data row are pure ASCII, so their byte offsets equal their character
 * positions. Later rows start after Japanese text, so their byte offsets (95 and 154) run ahead of their character
 * positions.
 * </p>
 */
@Test
public void testGetRecordThreeBytesRead() throws Exception {
    // Each Japanese field ends with U+FF5E (FULLWIDTH TILDE), written as an escape so that the three-byte width the
    // expected offsets rely on does not depend on how this source file is encoded or normalized.
    final String code = "id,date,val5,val4\n" +
        "11111111111111,'4017-09-01',きちんと節分近くには咲いてる\uFF5E,v4\n" +
        "22222222222222,'4017-01-01',おはよう私の友人\uFF5E,v4\n" +
        "33333333333333,'4017-01-01',きる自然の力ってすごいな\uFF5E,v4\n";
    final CSVFormat format = CSVFormat.Builder.create()
        .setDelimiter(',')
        .setQuote('\'')
        .build();

    // try-with-resources closes the parser even when an assertion fails.
    try (CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8")) {
        assertEquals(0, parser.getRecordNumber());

        CSVRecord record = parser.nextRecord();
        assertNotNull(record);
        assertEquals(1, record.getRecordNumber());
        assertEquals(code.indexOf('i'), record.getCharacterPosition());
        assertEquals(record.getCharacterPosition(), record.getCharacterByte());

        record = parser.nextRecord();
        assertNotNull(record);
        assertEquals(2, record.getRecordNumber());
        assertEquals(code.indexOf('1'), record.getCharacterPosition());
        assertEquals(record.getCharacterPosition(), record.getCharacterByte());

        record = parser.nextRecord();
        assertNotNull(record);
        assertEquals(3, record.getRecordNumber());
        assertEquals(code.indexOf('2'), record.getCharacterPosition());
        assertEquals(95, record.getCharacterByte());

        record = parser.nextRecord();
        assertNotNull(record);
        assertEquals(4, record.getRecordNumber());
        assertEquals(code.indexOf('3'), record.getCharacterPosition());
        assertEquals(154, record.getCharacterByte());
    }
}

/**
 * Verifies record byte offsets when fields contain supplementary characters (emoji) that take four bytes each in
 * UTF-8 and two {@code char}s each in a Java string.
 */
@Test
public void testGetRecordFourBytesRead() throws Exception {
    final String code = "id,a,b,c\n" +
        "1,😊,🤔,😂\n" +
        "2,😊,🤔,😂\n" +
        "3,😊,🤔,😂\n";
    final CSVFormat format = CSVFormat.Builder.create()
        .setDelimiter(',')
        .setQuote('\'')
        .build();

    // try-with-resources closes the parser even when an assertion fails.
    try (CSVParser parser = format.parse(new StringReader(code), 0L, 1L, "UTF-8")) {
        assertEquals(0, parser.getRecordNumber());

        CSVRecord record = parser.nextRecord();
        assertNotNull(record);
        assertEquals(1, record.getRecordNumber());
        assertEquals(code.indexOf('i'), record.getCharacterPosition());
        assertEquals(record.getCharacterPosition(), record.getCharacterByte());

        record = parser.nextRecord();
        assertNotNull(record);
        assertEquals(2, record.getRecordNumber());
        assertEquals(code.indexOf('1'), record.getCharacterPosition());
        assertEquals(record.getCharacterPosition(), record.getCharacterByte());

        record = parser.nextRecord();
        assertNotNull(record);
        assertEquals(3, record.getRecordNumber());
        assertEquals(code.indexOf('2'), record.getCharacterPosition());
        assertEquals(26, record.getCharacterByte());

        record = parser.nextRecord();
        assertNotNull(record);
        assertEquals(4, record.getRecordNumber());
        assertEquals(code.indexOf('3'), record.getCharacterPosition());
        assertEquals(43, record.getCharacterByte());
    }
}

@Test
public void testGetHeaderMap() throws Exception {
try (final CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {
Expand Down
Loading

0 comments on commit f0a2398

Please sign in to comment.