Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support in Commons CSV for tracking byte positions during parsing #9

Merged
merged 6 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
<url>https://commons.apache.org/proper/commons-csv/</url>
<inceptionYear>2005</inceptionYear>
<description>The Apache Commons CSV library provides a simple interface for reading and writing CSV files of various types.</description>
<packaging>jar</packaging>

<dependencies>
<dependency>
Expand Down Expand Up @@ -231,6 +232,8 @@
<exclude>src/test/resources/org/apache/commons/csv/CSV-141/csv-141.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/csv-167/sample1.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-198/optd_por_public.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/emoji.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-196/japanese.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSV-213/999751170.patch.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/bom.csv</exclude>
<exclude>src/test/resources/org/apache/commons/csv/CSVFileParser/test.csv</exclude>
Expand Down
24 changes: 24 additions & 0 deletions src/main/java/org/apache/commons/csv/CSVFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -2074,6 +2074,30 @@ public CSVParser parse(final Reader reader) throws IOException {
return new CSVParser(reader, this);
}

/**
 * Parses the specified content, making the byte position of each record available.
 *
 * <p>
 * This method parses CSV data from a {@link Reader}. The given character offset is added to the character position
 * reported for each record, and the given record number is the next record number to assign. It returns a
 * {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s.
 * </p>
 *
 * <p>
 * The {@code encoding} is not used to decode the input, since a {@link Reader} already supplies characters. It names
 * the charset used to compute how many bytes the characters read occupy, which is what
 * {@link CSVRecord#getCharacterByte()} reports.
 * </p>
 *
 * <p>
 * For additional parsing options, see the various static parse methods available on {@link CSVParser}.
 * </p>
 *
 * @param reader the reader supplying the CSV characters
 * @param characterOffset the character offset to start parsing from
 * @param recordNumber the next record number to assign
 * @param encoding the name of the charset used to count the bytes of the characters read; may be {@code null}, in
 *        which case no charset encoder is created for byte counting
 * @return a parser over a stream of {@link CSVRecord}s.
 * @throws IOException If an I/O error occurs
 * @throws CSVException Thrown on invalid input.
 */
public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, final String encoding) throws IOException {
    return new CSVParser(reader, this, characterOffset, recordNumber, encoding);
}

/**
* Prints to the specified output.
*
Expand Down
34 changes: 32 additions & 2 deletions src/main/java/org/apache/commons/csv/CSVParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -438,10 +438,39 @@ public CSVParser(final Reader reader, final CSVFormat format) throws IOException
@SuppressWarnings("resource")
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber)
throws IOException {
// Delegate with a null encoding: the wrapped reader then creates no charset encoder,
// so its read() does not accumulate a byte count for this parser.
this(reader, format, characterOffset, recordNumber, null);
}

/**
* Constructs a new instance using the given {@link CSVFormat}
*
* <p>
* If you do not read all records from the given {@code reader}, you should call {@link #close()} on the parser,
* unless you close the {@code reader}.
* </p>
*
* @param reader
* a Reader containing CSV-formatted input. Must not be null.
* @param format
* the CSVFormat used for CSV parsing. Must not be null.
* @param characterOffset
* Lexer offset when the parser does not start parsing at the beginning of the source.
* @param recordNumber
* The next record number to assign
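* @param encoding
* The name of the charset used to count the bytes of the characters read from the reader; it is not used to decode the input. May be {@code null}, in which case no encoder is created.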
* @throws IllegalArgumentException
* If the parameters of the format are inconsistent or if either the reader or format is null.
* @throws IOException
* If there is a problem reading the header or skipping the first record
* @throws CSVException Thrown on invalid input.
*/
public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber,
String encoding) throws IOException {
Objects.requireNonNull(reader, "reader");
Objects.requireNonNull(format, "format");
this.format = format.copy();
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader));
this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding));
this.csvRecordIterator = new CSVRecordIterator();
this.headers = createHeaders();
this.characterOffset = characterOffset;
Expand Down Expand Up @@ -768,6 +797,7 @@ CSVRecord nextRecord() throws IOException {
recordList.clear();
StringBuilder sb = null;
final long startCharPosition = lexer.getCharacterPosition() + characterOffset;
final long startCharByte = lexer.getBytesRead() + this.characterOffset;
do {
reusableToken.reset();
lexer.nextToken(reusableToken);
Expand Down Expand Up @@ -805,7 +835,7 @@ CSVRecord nextRecord() throws IOException {
recordNumber++;
final String comment = Objects.toString(sb, null);
result = new CSVRecord(this, recordList.toArray(Constants.EMPTY_STRING_ARRAY), comment,
recordNumber, startCharPosition);
recordNumber, startCharPosition, startCharByte);
}
return result;
}
Expand Down
24 changes: 24 additions & 0 deletions src/main/java/org/apache/commons/csv/CSVRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ public final class CSVRecord implements Serializable, Iterable<String> {
*/
private final long characterPosition;

/**
 * The byte position at which this record starts, as supplied by the parser when it creates the record;
 * {@code 0} when the record is created through the constructor that takes no byte position.
 */
private final long characterByte;

/** The accumulated comments (if any) */
private final String comment;

Expand All @@ -67,8 +72,18 @@ public final class CSVRecord implements Serializable, Iterable<String> {
this.parser = parser;
this.comment = comment;
this.characterPosition = characterPosition;
this.characterByte = 0L;
}

/**
 * Creates a record that also carries the byte position at which it starts.
 *
 * @param parser the parser to associate with this record
 * @param values the field values; {@code null} is replaced by an empty array
 * @param comment the comment for this record, may be {@code null}
 * @param recordNumber the number assigned to this record
 * @param characterPosition the character position at which this record starts
 * @param characterByte the byte position at which this record starts
 */
CSVRecord(final CSVParser parser, final String[] values, final String comment, final long recordNumber,
        final long characterPosition, final long characterByte) {
    this.parser = parser;
    this.comment = comment;
    this.recordNumber = recordNumber;
    // Never store a null array: fall back to the shared empty array.
    if (values == null) {
        this.values = Constants.EMPTY_STRING_ARRAY;
    } else {
        this.values = values;
    }
    this.characterByte = characterByte;
    this.characterPosition = characterPosition;
}
/**
* Returns a value by {@link Enum}.
*
Expand Down Expand Up @@ -144,6 +159,15 @@ public long getCharacterPosition() {
return characterPosition;
}

/**
 * Gets the byte position at which this record starts in the source stream.
 *
 * @return the byte position stored for the start of this record; {@code 0} if the record was created without one.
 */
public long getCharacterByte() {
    return this.characterByte;
}

/**
* Returns the comment for this record, if any.
* Note that comments are attached to the following record.
Expand Down
61 changes: 61 additions & 0 deletions src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@

import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedBufferedReader;
Expand All @@ -49,13 +53,27 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader {
private long position;
private long positionMark;

/** The number of bytes read so far */
private long bytesRead;
/** The value of {@code bytesRead} saved by {@code mark} and restored by {@code reset}. */
private long bytesReadMark;

/**
 * Encoder used to calculate the bytes of characters. It is only assigned when a non-null encoding is passed to the
 * constructor; while it is {@code null}, {@code read()} does not add to {@code bytesRead}.
 */
CharsetEncoder encoder;

/**
* Constructs a new instance using the default buffer size.
*/
ExtendedBufferedReader(final Reader reader) {
super(reader);
}

/**
 * Constructs a new instance using the default buffer size that can also count the bytes of the characters it reads.
 *
 * @param reader the reader to wrap.
 * @param encoding the name of the charset used to compute the encoded size of the characters read, or {@code null}
 *        to create no encoder (bytes are then not counted by {@code read()}).
 * @throws java.nio.charset.IllegalCharsetNameException if the given charset name is illegal.
 * @throws java.nio.charset.UnsupportedCharsetException if the named charset is not supported by this JVM.
 */
ExtendedBufferedReader(final Reader reader, final String encoding) {
    super(reader);
    if (encoding != null) {
        encoder = Charset.forName(encoding).newEncoder();
    }
}

/**
* Closes the stream.
*
Expand Down Expand Up @@ -108,6 +126,7 @@ public void mark(final int readAheadLimit) throws IOException {
lineNumberMark = lineNumber;
lastCharMark = lastChar;
positionMark = position;
bytesReadMark = bytesRead;
super.mark(readAheadLimit);
}

Expand All @@ -118,11 +137,43 @@ public int read() throws IOException {
current == EOF && lastChar != CR && lastChar != LF && lastChar != EOF) {
lineNumber++;
}
if (encoder != null) {
this.bytesRead += getCharBytes(current);
}
lastChar = current;
position++;
return lastChar;
}

/**
 * Computes how many bytes the value just read contributes to the byte count in the configured encoding.
 *
 * <p>
 * A Java {@code char} is a 16-bit UTF-16 code unit:
 * </p>
 * <ul>
 * <li>U+0000 to U+FFFF (the Basic Multilingual Plane) is represented by a single {@code char}; in UTF-8 such a
 * character takes 1, 2 or 3 bytes.</li>
 * <li>U+10000 to U+10FFFF (supplementary characters) is represented by a pair of {@code char}s: a high surrogate
 * (U+D800 to U+DBFF) followed by a low surrogate (U+DC00 to U+DFFF); in UTF-8 such a character takes 4 bytes.</li>
 * </ul>
 * <p>
 * Because a surrogate pair arrives as two separate reads, a high surrogate contributes 0 bytes and the whole pair is
 * counted when its low surrogate is read, using {@code lastChar} (the previously read value) as the high surrogate.
 * </p>
 * <p>
 * NOTE(review): every call encodes independently, so a charset whose encoder emits a prefix per encode operation
 * (presumably a byte-order mark for "UTF-16") would be over-counted; confirm before relying on such charsets.
 * </p>
 *
 * @param current the value just read: a character, or a negative value at end of stream.
 * @return the number of bytes to add to the running byte count for this read.
 * @throws CharacterCodingException if a low surrogate does not follow a high surrogate, or if the encoder cannot
 *         encode the character.
 */
private long getCharBytes(final int current) throws CharacterCodingException {
    if (current < 0) {
        // End of stream is not a character. Casting -1 to char would yield U+FFFF, which is not a surrogate
        // and would be encoded and counted as real bytes.
        return 0;
    }
    final char cChar = (char) current;
    if (!Character.isSurrogate(cChar)) {
        return encoder.encode(CharBuffer.wrap(new char[] {cChar})).limit();
    }
    if (Character.isHighSurrogate(cChar)) {
        // Defer counting until the low surrogate arrives so the pair is encoded as one code point.
        return 0;
    }
    final char lChar = (char) lastChar;
    if (Character.isSurrogatePair(lChar, cChar)) {
        return encoder.encode(CharBuffer.wrap(new char[] {lChar, cChar})).limit();
    }
    // A low surrogate that is not preceded by a high surrogate cannot be encoded.
    throw new CharacterCodingException();
}

@Override
public int read(final char[] buf, final int offset, final int length) throws IOException {
if (length == 0) {
Expand Down Expand Up @@ -187,7 +238,17 @@ public void reset() throws IOException {
lineNumber = lineNumberMark;
lastChar = lastCharMark;
position = positionMark;
bytesRead = bytesReadMark;
super.reset();
}

/**
 * Gets the running count of bytes for the characters read so far.
 *
 * @return the number of bytes read by this reader.
 */
long getBytesRead() {
    return bytesRead;
}

}
9 changes: 9 additions & 0 deletions src/main/java/org/apache/commons/csv/Lexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,15 @@ long getCharacterPosition() {
return reader.getPosition();
}

/**
 * Gets the number of bytes read so far, as reported by the underlying reader.
 *
 * @return the number of bytes read.
 */
long getBytesRead() {
    final long bytesRead = reader.getBytesRead();
    return bytesRead;
}

/**
* Returns the current line number
*
Expand Down
78 changes: 78 additions & 0 deletions src/test/java/org/apache/commons/csv/CSVParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,84 @@ public void testGetHeaderComment_NoComment3() throws IOException {
}
}

/**
 * Checks record byte positions when the data contains characters that need more than one byte in UTF-8.
 * Expected byte positions are derived from the UTF-8 encoding of the text that precedes each record.
 */
@Test
public void testGetRecordThreeBytesRead() throws Exception {
    final String code = "id,date,val5,val4\n" +
        "11111111111111,'4017-09-01',きちんと節分近くには咲いてる~,v4\n" +
        "22222222222222,'4017-01-01',おはよう私の友人~,v4\n" +
        "33333333333333,'4017-01-01',きる自然の力ってすごいな~,v4\n";
    final java.nio.charset.Charset charset = java.nio.charset.StandardCharsets.UTF_8;
    final CSVFormat format = CSVFormat.Builder.create()
        .setDelimiter(',')
        .setQuote('\'')
        .build();

    // try-with-resources so the parser is closed even when an assertion fails.
    try (CSVParser parser = format.parse(new StringReader(code), 0L, 1L, charset.name())) {
        CSVRecord record;
        assertEquals(0, parser.getRecordNumber());

        // Records 1 and 2 are preceded by ASCII only, so byte and character positions coincide.
        assertNotNull(record = parser.nextRecord());
        assertEquals(1, record.getRecordNumber());
        assertEquals(code.indexOf('i'), record.getCharacterPosition());
        assertEquals(record.getCharacterPosition(), record.getCharacterByte());

        assertNotNull(record = parser.nextRecord());
        assertEquals(2, record.getRecordNumber());
        assertEquals(code.indexOf('1'), record.getCharacterPosition());
        assertEquals(record.getCharacterPosition(), record.getCharacterByte());

        // From record 3 on, multi-byte characters precede the record start.
        assertNotNull(record = parser.nextRecord());
        assertEquals(3, record.getRecordNumber());
        final int thirdStart = code.indexOf('2');
        assertEquals(thirdStart, record.getCharacterPosition());
        assertEquals(code.substring(0, thirdStart).getBytes(charset).length, record.getCharacterByte());

        assertNotNull(record = parser.nextRecord());
        assertEquals(4, record.getRecordNumber());
        final int fourthStart = code.indexOf('3');
        assertEquals(fourthStart, record.getCharacterPosition());
        assertEquals(code.substring(0, fourthStart).getBytes(charset).length, record.getCharacterByte());
    }
}

/**
 * Checks record byte positions when the data contains supplementary characters (surrogate pairs in Java).
 * Expected byte positions are derived from the UTF-8 encoding of the text that precedes each record.
 */
@Test
public void testGetRecordFourBytesRead() throws Exception {
    final String code = "id,a,b,c\n" +
        "1,😊,🤔,😂\n" +
        "2,😊,🤔,😂\n" +
        "3,😊,🤔,😂\n";
    final java.nio.charset.Charset charset = java.nio.charset.StandardCharsets.UTF_8;
    final CSVFormat format = CSVFormat.Builder.create()
        .setDelimiter(',')
        .setQuote('\'')
        .build();

    // try-with-resources so the parser is closed even when an assertion fails.
    try (CSVParser parser = format.parse(new StringReader(code), 0L, 1L, charset.name())) {
        CSVRecord record;
        assertEquals(0, parser.getRecordNumber());

        // Records 1 and 2 are preceded by ASCII only, so byte and character positions coincide.
        assertNotNull(record = parser.nextRecord());
        assertEquals(1, record.getRecordNumber());
        assertEquals(code.indexOf('i'), record.getCharacterPosition());
        assertEquals(record.getCharacterPosition(), record.getCharacterByte());

        assertNotNull(record = parser.nextRecord());
        assertEquals(2, record.getRecordNumber());
        assertEquals(code.indexOf('1'), record.getCharacterPosition());
        assertEquals(record.getCharacterPosition(), record.getCharacterByte());

        // From record 3 on, emoji precede the record start: each is two chars but four UTF-8 bytes.
        assertNotNull(record = parser.nextRecord());
        assertEquals(3, record.getRecordNumber());
        final int thirdStart = code.indexOf('2');
        assertEquals(thirdStart, record.getCharacterPosition());
        assertEquals(code.substring(0, thirdStart).getBytes(charset).length, record.getCharacterByte());

        assertNotNull(record = parser.nextRecord());
        assertEquals(4, record.getRecordNumber());
        final int fourthStart = code.indexOf('3');
        assertEquals(fourthStart, record.getCharacterPosition());
        assertEquals(code.substring(0, fourthStart).getBytes(charset).length, record.getCharacterByte());
    }
}

@Test
public void testGetHeaderMap() throws Exception {
try (final CSVParser parser = CSVParser.parse("a,b,c\n1,2,3\nx,y,z", CSVFormat.DEFAULT.withHeader("A", "B", "C"))) {
Expand Down
Loading
Loading