From 610c022164b71614fb92366b3694df6d700283c8 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 15 Mar 2024 08:46:42 -0500 Subject: [PATCH] This fixes an NPE when trying to read empty JSON data by adding a new API for missing information (#15307) CUDF cannot create a table with rows and no columns, but that is exactly what we need to be able to read some JSON input. So this adds in a new API that lets us work around this problem if we know how many rows we expect to see. This is not an ideal solution, so it is not a fix for #5712 generically. But it is a stopgap, especially for cases when we know how many rows to expect. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/15307 --- java/src/main/java/ai/rapids/cudf/Table.java | 67 +++++++++++++++++-- .../java/ai/rapids/cudf/TableWithMeta.java | 7 +- .../test/java/ai/rapids/cudf/TableTest.java | 53 +++++++++++++++ 3 files changed, 118 insertions(+), 9 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index f3b4b9484ef..5ce2f9d2d6e 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1203,7 +1203,7 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest } } - private static Table gatherJSONColumns(Schema schema, TableWithMeta twm) { + private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emptyRowCount) { String[] neededColumns = schema.getColumnNames(); if (neededColumns == null || neededColumns.length == 0) { return twm.releaseTable(); @@ -1217,6 +1217,11 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm) { DType[] types = schema.getChildTypes(); ColumnVector[] columns = new ColumnVector[neededColumns.length]; try (Table tbl = twm.releaseTable()) { + int 
rowCount = tbl == null ? emptyRowCount : (int)tbl.getRowCount(); + if (rowCount < 0) { + throw new IllegalStateException( + "No empty row count provided and the table read has no row count or columns"); + } for (int i = 0; i < columns.length; i++) { String neededColumnName = neededColumns[i]; Integer index = indices.get(neededColumnName); @@ -1234,7 +1239,7 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm) { } } else { try (Scalar s = Scalar.fromNull(types[i])) { - columns[i] = ColumnVector.fromScalar(s, (int)tbl.getRowCount()); + columns[i] = ColumnVector.fromScalar(s, rowCount); } } } @@ -1268,7 +1273,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) { - return gatherJSONColumns(schema, twm); + return gatherJSONColumns(schema, twm, -1); } } @@ -1284,6 +1289,23 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) { */ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len, HostMemoryAllocator hostMemoryAllocator) { + return readJSON(schema, opts, buffer, offset, len, hostMemoryAllocator, -1); + } + + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param buffer raw UTF8 formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @param hostMemoryAllocator allocator for host memory buffers + * @param emptyRowCount the number of rows to return if no columns were read. + * @return the data parsed as a table on the GPU. 
+ */ + public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, + long len, HostMemoryAllocator hostMemoryAllocator, + int emptyRowCount) { if (len <= 0) { len = buffer.length - offset; } @@ -1292,10 +1314,16 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon assert offset >= 0 && offset < buffer.length; try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) { newBuf.setBytes(0, buffer, offset, len); - return readJSON(schema, opts, newBuf, 0, len); + return readJSON(schema, opts, newBuf, 0, len, emptyRowCount); } } + public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, + long len, int emptyRowCount) { + return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get(), + emptyRowCount); + } + public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset, long len) { return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); @@ -1357,6 +1385,21 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) { */ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer, long offset, long len) { + return readJSON(schema, opts, buffer, offset, len, -1); + } + + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param buffer raw UTF8 formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @param emptyRowCount the number of rows to use if no columns were found. + * @return the data parsed as a table on the GPU. 
+ */ + public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer, + long offset, long len, int emptyRowCount) { if (len <= 0) { len = buffer.length - offset; } @@ -1370,7 +1413,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), opts.isNormalizeWhitespace(), opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) { - return gatherJSONColumns(schema, twm); + return gatherJSONColumns(schema, twm, emptyRowCount); } } @@ -1382,13 +1425,25 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b * @return the data parsed as a table on the GPU. */ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) { + return readJSON(schema, opts, ds, -1); + } + + /** + * Read JSON formatted data. + * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema. + * @param opts various JSON parsing options. + * @param ds the DataSource to read from. + * @param emtpyRowCount the number of rows to return if no columns were read. + * @return the data parsed as a table on the GPU. 
+ */ + public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emtpyRowCount) { long dsHandle = DataSourceHelper.createWrapperDataSource(ds); try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(), opts.isNormalizeWhitespace(), opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), dsHandle))) { - return gatherJSONColumns(schema, twm); + return gatherJSONColumns(schema, twm, emtpyRowCount); } finally { DataSourceHelper.destroyWrapperDataSource(dsHandle); } diff --git a/java/src/main/java/ai/rapids/cudf/TableWithMeta.java b/java/src/main/java/ai/rapids/cudf/TableWithMeta.java index 040fa68f01e..c3fe2669132 100644 --- a/java/src/main/java/ai/rapids/cudf/TableWithMeta.java +++ b/java/src/main/java/ai/rapids/cudf/TableWithMeta.java @@ -19,8 +19,6 @@ package ai.rapids.cudf; -import java.util.Arrays; - /** * A table along with some metadata about the table. This is typically returned when * reading data from an input file where the metadata can be important. 
@@ -80,7 +78,7 @@ public String toString() { */ public Table releaseTable() { long[] ptr = releaseTable(handle); - if (ptr == null) { + if (ptr == null || ptr.length == 0) { return null; } else { return new Table(ptr); @@ -120,6 +118,9 @@ NestedChildren getChildren() { String[] flatNames = getFlattenedColumnNames(handle); ChildAndOffset tmp = unflatten(0, flatNames, flatCount); children = tmp.child; + if (children == null) { + children = new NestedChildren(new String[0], new NestedChildren[0]); + } } return children; } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index d06ea05144b..30905783c7f 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -331,6 +331,59 @@ void testReadJSONFile() { } } + private static final byte[] EMPTY_JSON_DATA_BUFFER = ("{}\n").getBytes(StandardCharsets.UTF_8); + + @Test + void testReadEmptyJson() { + Schema schema = Schema.builder() + .column(DType.STRING, "name") + .build(); + JSONOptions opts = JSONOptions.builder() + .withKeepQuotes(true) + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeSingleQuotes(true) + .withNormalizeWhitespace(true) + .withLines(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column((String)null) + .build(); + Table table = Table.readJSON(schema, opts, EMPTY_JSON_DATA_BUFFER, 0, + EMPTY_JSON_DATA_BUFFER.length, 1)) { + assertTablesAreEqual(expected, table); + } + } + + private static final byte[] EMPTY_ARRAY_JSON_DATA_BUFFER = + ("{'a':[]}\n").getBytes(StandardCharsets.UTF_8); + + @Test + void testReadEmptyArrayJson() { + Schema.Builder builder = Schema.builder(); + Schema.Builder listBuilder = builder.addColumn(DType.LIST, "a"); + // INT8 is selected here because CUDF always returns INT8 for this no matter what we ask for. 
+ listBuilder.addColumn(DType.INT8, "child"); + Schema schema = builder.build(); + JSONOptions opts = JSONOptions.builder() + .withKeepQuotes(true) + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeSingleQuotes(true) + .withNormalizeWhitespace(true) + .withLines(true) + .build(); + ListType lt = new ListType(true, new BasicType(true, DType.INT8)); + try (Table expected = new Table.TestBuilder() + .column(lt, new ArrayList()) + .build(); + Table table = Table.readJSON(schema, opts, EMPTY_ARRAY_JSON_DATA_BUFFER, 0, + EMPTY_ARRAY_JSON_DATA_BUFFER.length, 1)) { + TableDebug.get().debug("OUTPUT", table); + assertTablesAreEqual(expected, table); + } + } + @Test void testReadSingleQuotesJSONFile() throws IOException { Schema schema = Schema.builder()