diff --git a/README.md b/README.md index 646646a..3b64e4d 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,15 @@ CsvMapper mapper = new CsvMapper(); Pojo value = ...; CsvSchema schema = mapper.schemaFor(Pojo.class); // schema from 'Pojo' definition String csv = mapper.writer(schema).writeValueAsString(value); -Pojo result = mapper.readerFor(Pojo.class).with(schema).read(csv); +MappingIterator<Pojo> it = mapper.readerFor(Pojo.class).with(schema) + .readValues(csv); +// Either read them all one by one (streaming) +while (it.hasNextValue()) { + Pojo next = it.nextValue(); + // ... do something with the value +} +// or, alternatively all in one go +List<Pojo> all = it.readAll(); ``` ## Data-binding without schema @@ -235,7 +243,8 @@ Jackson supports following extension or variations: # Limitations * Due to tabular nature of `CSV` format, deeply nested data structures are not well supported. -* Use of Tree Model (`JsonNode`) is supported, but only within limitations of `CSv` format. + * You can use `@JsonUnwrapped` to get around this +* Use of Tree Model (`JsonNode`) is supported, but only within limitations of `CSV` format.
# Future improvements diff --git a/release-notes/VERSION b/release-notes/VERSION index 8b2e676..89520c0 100644 --- a/release-notes/VERSION +++ b/release-notes/VERSION @@ -11,6 +11,8 @@ Project: jackson-dataformat-csv (contributed by georgewfraser@github) #130: Add fluent addColumns operation to CsvSchema.Builder (contributed by Peter A) +#137: Inject "missing" trailing columns as `null`s + (`CsvParser.Feature.INSERT_NULLS_FOR_MISSING_COLUMNS`) #139: Add `CsvParser.Feature.ALLOW_TRAILING_COMMA` to allow enforcing strict handling (contributed by Nick B) #142: Add methods for appending columns of a `CsvSchema` into another diff --git a/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvParser.java b/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvParser.java index 7c2d0f5..8a8f280 100644 --- a/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvParser.java +++ b/src/main/java/com/fasterxml/jackson/dataformat/csv/CsvParser.java @@ -76,6 +76,20 @@ public enum Feature * IGNORE_TRAILING_UNMAPPABLE is also disabled. */ ALLOW_TRAILING_COMMA(true), + + /** + * Feature that allows "inserting" virtual key / `null` value pairs in case + * a row contains fewer columns than declared by configured schema. + * This typically has the effect of forcing an explicit `null` assignment (or + * corresponding "null value", if so configured) at databinding level. + * If disabled, no extra work is done and values for "missing" columns are + * not exposed as part of the token stream. + *

+ * Feature is disabled by default. + * + * @since 2.9 + */ + INSERT_NULLS_FOR_MISSING_COLUMNS(false), ; final boolean _defaultState; @@ -172,6 +186,24 @@ private Feature(boolean defaultState) { */ protected final static int STATE_SKIP_EXTRA_COLUMNS = 6; + /** + * State in which we should expose name token for a "missing column" + * (for which placeholder `null` value is to be added as well); + * see {@link Feature#INSERT_NULLS_FOR_MISSING_COLUMNS} for details. + * + * @since 2.9 + */ + protected final static int STATE_MISSING_NAME = 7; + + /** + * State in which we should expose `null` value token as a value for + * "missing" column; + * see {@link Feature#INSERT_NULLS_FOR_MISSING_COLUMNS} for details. + * + * @since 2.9 + */ + protected final static int STATE_MISSING_VALUE = 8; + /** * State in which end marker is returned; either * null (if no array wrapping), or @@ -179,7 +211,7 @@ private Feature(boolean defaultState) { * This step will loop, returning series of nulls * if {@link #nextToken} is called multiple times. 
*/ - protected final static int STATE_DOC_END = 7; + protected final static int STATE_DOC_END = 9; /* /********************************************************************** @@ -539,6 +571,10 @@ public JsonToken nextToken() throws IOException case STATE_SKIP_EXTRA_COLUMNS: // Need to just skip whatever remains return _skipUntilEndOfLine(); + case STATE_MISSING_NAME: + return (_currToken = _handleMissingName()); + case STATE_MISSING_VALUE: + return (_currToken = _handleMissingValue()); case STATE_DOC_END: _reader.close(); if (_parsingContext.inRoot()) { @@ -618,10 +654,81 @@ public String nextTextValue() throws IOException /* /********************************************************** - /* Parsing, helper methods + /* Parsing, helper methods, regular /********************************************************** */ - + + /** + * Method called to process the expected header line + */ + protected void _readHeaderLine() throws IOException { + /* + When the header line is present and the settings ask for it + to be processed, two different options are possible: + + a) The schema has been populated. In this case, build a new + schema where the order matches the *actual* order in which + the given CSV file offers its columns, if and only if _schema.reordersColumns() + is set to true; in these cases the consumer of the csv file + knows about the columns but not necessarily the order in + which they are defined. + + b) The schema has not been populated. In this case, build a + default schema based on the columns found in the header.
+ */ + + if (_schema.size() > 0 && !_schema.reordersColumns()) { + if (_schema.strictHeaders()) { + String name; + for (CsvSchema.Column column : _schema._columns) { + name = _reader.nextString(); + if (name == null) { + _reportError(String.format("Missing header %s", column.getName())); + } else if (!column.getName().equals(name)) { + _reportError(String.format("Expected header %s, actual header %s", column.getName(), name)); + } + } + if ((name = _reader.nextString()) != null) { + _reportError(String.format("Extra header %s", name)); + } + } + else { + //noinspection StatementWithEmptyBody + while (_reader.nextString() != null) { /* does nothing */ } + } + return; + } + + // either the schema is empty or reorder columns flag is set + String name; + CsvSchema.Builder builder = _schema.rebuild().clearColumns(); + + while ((name = _reader.nextString()) != null) { + // one more thing: always trim names, regardless of config settings + name = name.trim(); + + // See if "old" schema defined type; if so, use that type... + CsvSchema.Column prev = _schema.column(name); + if (prev != null) { + builder.addColumn(name, prev.getType()); + } else { + builder.addColumn(name); + } + } + + // Ok: did we get any columns? + CsvSchema newSchema = builder.build(); + int size = newSchema.size(); + if (size < 2) { // 1 just because we may get 'empty' header name + String first = (size == 0) ? "" : newSchema.columnName(0).trim(); + if (first.length() == 0) { + _reportMappingError("Empty header line: can not bind data"); + } + } + // otherwise we will use what we got + setSchema(builder.build()); + } + /** * Method called to handle details of initializing things to return * the very first token. @@ -692,15 +799,13 @@ protected JsonToken _handleNextEntry() throws IOException } if (next == null) { // end of record or input... 
- _parsingContext = _parsingContext.getParent(); - // let's handle EOF or linefeed - if (!_reader.startNewLine()) { - _state = STATE_DOC_END; - } else { - // no, just end of record - _state = STATE_RECORD_START; + // 16-Mar-2017, tatu: [dataformat-csv#137] Missing column(s)? + if (_columnIndex < _columnCount) { + if (Feature.INSERT_NULLS_FOR_MISSING_COLUMNS.enabledIn(_formatFeatures)) { + return _injectMissingColumns(); + } } - return JsonToken.END_OBJECT; + return _handleObjectRowEnd(); } _currentValue = next; if (_columnIndex >= _columnCount) { @@ -711,57 +816,6 @@ protected JsonToken _handleNextEntry() throws IOException return JsonToken.FIELD_NAME; } - /** - * Helper method called when an extraneous column value is found. - * What happens then depends on configuration, but there are three - * main choices: ignore value (and rest of line); expose extra value - * as "any property" using configured name, or throw an exception. - * - * @since 2.7 - */ - protected JsonToken _handleExtraColumn(String value) throws IOException - { - // If "any properties" enabled, expose as such - String anyProp = _schema.getAnyPropertyName(); - if (anyProp != null) { - _currentName = anyProp; - _state = STATE_NAMED_VALUE; - return JsonToken.FIELD_NAME; - } - - _currentName = null; - // With [dataformat-csv#95] we'll simply ignore extra - if (Feature.IGNORE_TRAILING_UNMAPPABLE.enabledIn(_formatFeatures)) { - _state = STATE_SKIP_EXTRA_COLUMNS; - return _skipUntilEndOfLine(); - } - - // 14-Mar-2012, tatu: As per [dataformat-csv#1], let's allow one specific case - // of extra: if we get just one all-whitespace entry, that can be just skipped - _state = STATE_SKIP_EXTRA_COLUMNS; - if (_columnIndex == _columnCount && Feature.ALLOW_TRAILING_COMMA.enabledIn(_formatFeatures)) { - value = value.trim(); - if (value.isEmpty()) { - // if so, need to verify we then get the end-of-record; - // easiest to do by just calling ourselves again... 
- String next = _reader.nextString(); - if (next == null) { // should end of record or input - _parsingContext = _parsingContext.getParent(); - if (!_reader.startNewLine()) { - _state = STATE_DOC_END; - } else { - _state = STATE_RECORD_START; - } - return JsonToken.END_OBJECT; - } - } - } - - // 21-May-2015, tatu: Need to enter recovery mode, to skip remainder of the line - _reportMappingError("Too many entries: expected at most "+_columnCount+" (value #"+_columnCount+" ("+value.length()+" chars) \""+value+"\")"); - return null; - } - protected JsonToken _handleNamedValue() throws IOException { // 06-Oct-2015, tatu: During recovery, may get past all regular columns, @@ -849,75 +903,114 @@ protected JsonToken _handleArrayValue() throws IOException return JsonToken.VALUE_STRING; } - /** - * Method called to process the expected header line + /* + /********************************************************** + /* Parsing, helper methods, extra column(s) + /********************************************************** */ - protected void _readHeaderLine() throws IOException { - /* - When the header line is present and the settings ask for it - to be processed, two different options are possible: - - a) The schema has been populated. In this case, build a new - schema where the order matches the *actual* order in which - the given CSV file offers its columns, iif _schema.reordersColumns() - is set to true; there cases the consumer of the csv file - knows about the columns but not necessarily the order in - which they are defined. - b) The schema has not been populated. In this case, build a - default schema based on the columns found in the header. - */ + /** + * Helper method called when an extraneous column value is found. + * What happens then depends on configuration, but there are three + * main choices: ignore value (and rest of line); expose extra value + * as "any property" using configured name, or throw an exception. 
+ * + * @since 2.7 + */ + protected JsonToken _handleExtraColumn(String value) throws IOException + { + // If "any properties" enabled, expose as such + String anyProp = _schema.getAnyPropertyName(); + if (anyProp != null) { + _currentName = anyProp; + _state = STATE_NAMED_VALUE; + return JsonToken.FIELD_NAME; + } + _currentName = null; + // With [dataformat-csv#95] we'll simply ignore extra + if (Feature.IGNORE_TRAILING_UNMAPPABLE.enabledIn(_formatFeatures)) { + _state = STATE_SKIP_EXTRA_COLUMNS; + return _skipUntilEndOfLine(); + } - if (_schema.size() > 0 && !_schema.reordersColumns()) { - if (_schema.strictHeaders()) { - String name; - for (CsvSchema.Column column : _schema._columns) { - name = _reader.nextString(); - if (name == null) { - _reportError(String.format("Missing header %s", column.getName())); - } else if (!column.getName().equals(name)) { - _reportError(String.format("Expected header %s, actual header %s", column.getName(), name)); - } - } - if ((name = _reader.nextString()) != null) { - _reportError(String.format("Extra header %s", name)); + // 14-Mar-2012, tatu: As per [dataformat-csv#1], let's allow one specific case + // of extra: if we get just one all-whitespace entry, that can be just skipped + _state = STATE_SKIP_EXTRA_COLUMNS; + if (_columnIndex == _columnCount && Feature.ALLOW_TRAILING_COMMA.enabledIn(_formatFeatures)) { + value = value.trim(); + if (value.isEmpty()) { + // if so, need to verify we then get the end-of-record; + // easiest to do by just calling ourselves again... 
+ String next = _reader.nextString(); + if (next == null) { // should end of record or input + return _handleObjectRowEnd(); } } - else { - //noinspection StatementWithEmptyBody - while (_reader.nextString() != null) { /* does nothing */ } - } - return; } + // 21-May-2015, tatu: Need to enter recovery mode, to skip remainder of the line + _reportMappingError("Too many entries: expected at most %s (value #%d (%d chars) \"%s\")", + _columnCount, _columnCount, value.length(), value); + return null; + } - // either the schema is empty or reorder columns flag is set - String name; - CsvSchema.Builder builder = _schema.rebuild().clearColumns(); + /* + /********************************************************** + /* Parsing, helper methods, missing column(s) + /********************************************************** + */ - while ((name = _reader.nextString()) != null) { - // one more thing: always trim names, regardless of config settings - name = name.trim(); + /** + * Helper method called when end of row occurs before finding values for + * all schema-specified columns. + * + * @since 2.9 + */ + protected JsonToken _injectMissingColumns() throws IOException + { + _state = STATE_MISSING_VALUE; + _currentName = _schema.columnName(_columnIndex); + _currentValue = null; + return JsonToken.FIELD_NAME; + } - // See if "old" schema defined type; if so, use that type... - CsvSchema.Column prev = _schema.column(name); - if (prev != null) { - builder.addColumn(name, prev.getType()); - } else { - builder.addColumn(name); - } + protected JsonToken _handleMissingName() throws IOException + { + if (++_columnIndex < _columnCount) { + _state = STATE_MISSING_VALUE; + _currentName = _schema.columnName(_columnIndex); + // _currentValue already set to null earlier + return JsonToken.FIELD_NAME; } + return _handleObjectRowEnd(); + } - // Ok: did we get any columns? 
- CsvSchema newSchema = builder.build(); - int size = newSchema.size(); - if (size < 2) { // 1 just because we may get 'empty' header name - String first = (size == 0) ? "" : newSchema.columnName(0).trim(); - if (first.length() == 0) { - _reportMappingError("Empty header line: can not bind data"); - } + protected JsonToken _handleMissingValue() throws IOException + { + _state = STATE_MISSING_NAME; + return JsonToken.VALUE_NULL; + } + + /* + /********************************************************** + /* Parsing, helper methods: row end handling, recover + /********************************************************** + */ + + /** + * Helper method called to handle details of state update when end of logical + * record occurs. + * + * @since 2.9 + */ + protected final JsonToken _handleObjectRowEnd() throws IOException + { + _parsingContext = _parsingContext.getParent(); + if (!_reader.startNewLine()) { + _state = STATE_DOC_END; + } else { + _state = STATE_RECORD_START; } - // otherwise we will use what we got - setSchema(builder.build()); + return JsonToken.END_OBJECT; } protected final JsonToken _skipUntilEndOfLine() throws IOException @@ -932,13 +1025,14 @@ protected final JsonToken _skipUntilEndOfLine() throws IOException return (_currToken = _parsingContext.inArray() ? JsonToken.END_ARRAY : JsonToken.END_OBJECT); } - + /* /********************************************************** /* String value handling /********************************************************** */ - + + // For now we do not store char[] representation... 
@Override public boolean hasTextCharacters() { @@ -1006,7 +1100,7 @@ public byte[] getBinaryValue(Base64Variant variant) throws IOException { if (_binaryValue == null) { if (_currToken != JsonToken.VALUE_STRING) { - _reportMappingError("Current token ("+_currToken+") not VALUE_STRING, can not access as binary"); + _reportMappingError("Current token (%s) not VALUE_STRING, can not access as binary", _currToken); } ByteArrayBuilder builder = _getByteArrayBuilder(); _decodeBase64(_currentValue, builder, variant); @@ -1086,7 +1180,10 @@ protected void _handleEOF() throws JsonParseException { * * @since 2.7 */ - public void _reportMappingError(String msg) throws JsonProcessingException { + public void _reportMappingError(String msg, Object... args) throws JsonProcessingException { + if (args.length > 0) { + msg = String.format(msg, args); + } throw JsonMappingException.from(this, msg); } diff --git a/src/test/java/com/fasterxml/jackson/dataformat/csv/deser/MissingColumnsTest.java b/src/test/java/com/fasterxml/jackson/dataformat/csv/deser/MissingColumnsTest.java new file mode 100644 index 0000000..acd8ea5 --- /dev/null +++ b/src/test/java/com/fasterxml/jackson/dataformat/csv/deser/MissingColumnsTest.java @@ -0,0 +1,73 @@ +package com.fasterxml.jackson.dataformat.csv.deser; + +import com.fasterxml.jackson.annotation.JsonPropertyOrder; +import com.fasterxml.jackson.databind.ObjectReader; +import com.fasterxml.jackson.dataformat.csv.CsvMapper; +import com.fasterxml.jackson.dataformat.csv.CsvParser; +import com.fasterxml.jackson.dataformat.csv.CsvSchema; +import com.fasterxml.jackson.dataformat.csv.ModuleTestBase; + +/** + * Tests for cases where one or more of the schema-declared columns are + * missing.
+ */ +public class MissingColumnsTest extends ModuleTestBase +{ + @JsonPropertyOrder({ "a", "b", "c", "d" }) + static class ABCD { + public String a = "a"; + public String b = "b"; + public String c = "c"; + public String d = "d"; + } + + /* + /********************************************************************** + /* Test methods + /********************************************************************** + */ + + final CsvMapper MAPPER = mapperForCsv(); + + final CsvSchema schema = MAPPER.schemaFor(ABCD.class); + + // [dataformat-csv#137]: inject `null`s in place of missing + public void testInjectMissingAsNulls() throws Exception + { + + ObjectReader r = MAPPER.readerFor(ABCD.class) + .with(schema) + .with(CsvParser.Feature.INSERT_NULLS_FOR_MISSING_COLUMNS); + + // check with various number of missing; but first with no missing + ABCD result = r.readValue("first,second,third,fourth\n"); + assertEquals("third", result.c); + assertEquals("fourth", result.d); + + // then with one missing + result = r.readValue("first,second,third\n"); + assertEquals("third", result.c); + assertNull(result.d); + + // two + result = r.readValue("first,second\n"); + assertEquals("second", result.b); + assertNull(result.c); + assertNull(result.d); + + // etc + result = r.readValue("first\n"); + assertEquals("first", result.a); + assertNull(result.b); + assertNull(result.c); + assertNull(result.d); + + result = r.readValue("\n"); + // 16-Mar-2017, tatu: Actually first value is just empty, not null... since + // logical "empty String" does exist no matter what. + assertEquals("", result.a); + assertNull(result.b); + assertNull(result.c); + assertNull(result.d); + } +}