- * This method provides a way to parse CSV data from an input stream, starting at a specified character offset and record number, - * using a specified encoding. It returns a {@link CSVParser} that can be used to iterate over the parsed {@link CSVRecord}s. - *
- * - *- * For additional parsing options, see the various static parse methods available on {@link CSVParser}. - *
- * - * @param reader the input stream - * @param characterOffset the character offset to start parsing from - * @param recordNumber the initial record number to start counting from - * @param encoding the character encoding of the input stream - * @return a parser over a stream of {@link CSVRecord}s. - * @throws IOException If an I/O error occurs - * @throws CSVException Thrown on invalid input. - */ - public CSVParser parse(final Reader reader, final long characterOffset, final long recordNumber, String encoding) throws IOException { - return new CSVParser(reader, this, characterOffset, recordNumber, encoding); - } - /** * Prints to the specified output. * diff --git a/src/main/java/org/apache/commons/csv/CSVParser.java b/src/main/java/org/apache/commons/csv/CSVParser.java index c48e1da096..024dd562d4 100644 --- a/src/main/java/org/apache/commons/csv/CSVParser.java +++ b/src/main/java/org/apache/commons/csv/CSVParser.java @@ -153,6 +153,7 @@ public static class Builder extends AbstractStreamBuilder@@ -525,21 +536,22 @@ public CSVParser(final Reader reader, final CSVFormat format, final long charact * @param characterOffset * Lexer offset when the parser does not start parsing at the beginning of the source. * @param recordNumber - * The next record number to assign - * @param encoding - * The encoding to use for the reader + * The next record number to assign. + * @param charset + * The character encoding to be used for the reader. * @throws IllegalArgumentException * If the parameters of the format are inconsistent or if either the reader or format is null. * @throws IOException - * If there is a problem reading the header or skipping the first record + * If there is a problem reading the header or skipping the first record. * @throws CSVException Thrown on invalid input. + * @since 1.13.0. */ - public CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, - String encoding) throws IOException { + private CSVParser(final Reader reader, final CSVFormat format, final long characterOffset, final long recordNumber, final Charset charset) + throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(format, "format"); this.format = format.copy(); - this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, encoding)); + this.lexer = new Lexer(format, new ExtendedBufferedReader(reader, charset)); this.csvRecordIterator = new CSVRecordIterator(); this.headers = createHeaders(); this.characterOffset = characterOffset; diff --git a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java index 2a82d48a5a..158f90a755 100644 --- a/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java +++ b/src/main/java/org/apache/commons/csv/ExtendedBufferedReader.java @@ -53,12 +53,12 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { private long position; private long positionMark; - /** The number of bytes read so far */ + /** The number of bytes read so far. */ private long bytesRead; private long bytesReadMark; - /** Encoder used to calculate the bytes of characters */ - CharsetEncoder encoder; + /** Encoder for calculating the number of bytes for each character read. */ + private CharsetEncoder encoder; /** * Constructs a new instance using the default buffer size. @@ -67,10 +67,10 @@ final class ExtendedBufferedReader extends UnsynchronizedBufferedReader { super(reader); } - ExtendedBufferedReader(final Reader reader, String encoding) { + ExtendedBufferedReader(final Reader reader, Charset charset) { super(reader); - if (encoding != null) { - encoder = Charset.forName(encoding).newEncoder(); + if (charset != null) { + encoder = charset.newEncoder(); } } @@ -146,20 +146,30 @@ public int read() throws IOException { } /** - * In Java, a char data type are based on the original Unicode - * specification, which defined characters as fixed-width 16-bit entities. - * U+0000 to U+FFFF: - * - BMP, represented using 1 16-bit char - * - Consists of UTF-8 1-byte, 2-byte, some 3-byte chars - * U+10000 to U+10FFFF: - * - Supplementary characters, represented as a pair of characters, - * the first char from the high-surrogates range (\uD800-\uDBFF), - * and the second char from the low-surrogates range (uDC00-\uDFFF). - * - Consists of UTF-8 some 3-byte chars and 4-byte chars + * In Java, the {@code char} data type is based on the original Unicode + * specification, which defined characters as fixed-width 16-bit entities. + *
+ * The Unicode characters are divided into two main ranges: + *