Skip to content

Commit

Permalink
This fixes an NPE when trying to read empty JSON data by adding a new…
Browse files Browse the repository at this point in the history
… API for missing information (#15307)

CUDF cannot create a table with rows and no columns, but that is exactly what we need in order to read some JSON input. So this adds a new API that lets us work around the problem when we know how many rows we expect to see. This is not an ideal solution, so it is not a generic fix for #5712. But it is a stopgap, especially for cases where we know how many rows to expect.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - MithunR (https://github.com/mythrocks)

URL: #15307
  • Loading branch information
revans2 authored Mar 15, 2024
1 parent 95ce0bb commit 610c022
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 9 deletions.
67 changes: 61 additions & 6 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -1203,7 +1203,7 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest
}
}

private static Table gatherJSONColumns(Schema schema, TableWithMeta twm) {
private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emptyRowCount) {
String[] neededColumns = schema.getColumnNames();
if (neededColumns == null || neededColumns.length == 0) {
return twm.releaseTable();
Expand All @@ -1217,6 +1217,11 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm) {
DType[] types = schema.getChildTypes();
ColumnVector[] columns = new ColumnVector[neededColumns.length];
try (Table tbl = twm.releaseTable()) {
int rowCount = tbl == null ? emptyRowCount : (int)tbl.getRowCount();
if (rowCount < 0) {
throw new IllegalStateException(
"No empty row count provided and the table read has no row count or columns");
}
for (int i = 0; i < columns.length; i++) {
String neededColumnName = neededColumns[i];
Integer index = indices.get(neededColumnName);
Expand All @@ -1234,7 +1239,7 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm) {
}
} else {
try (Scalar s = Scalar.fromNull(types[i])) {
columns[i] = ColumnVector.fromScalar(s, (int)tbl.getRowCount());
columns[i] = ColumnVector.fromScalar(s, rowCount);
}
}
}
Expand Down Expand Up @@ -1268,7 +1273,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
opts.isMixedTypesAsStrings(),
opts.keepStringQuotes()))) {

return gatherJSONColumns(schema, twm);
return gatherJSONColumns(schema, twm, -1);
}
}

Expand All @@ -1284,6 +1289,23 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
*/
public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset,
                             long len, HostMemoryAllocator hostMemoryAllocator) {
  // A negative emptyRowCount tells the reader that no fallback row count was supplied.
  final int noEmptyRowCount = -1;
  return readJSON(schema, opts, buffer, offset, len, hostMemoryAllocator, noEmptyRowCount);
}

/**
* Read JSON formatted data.
* @param schema the schema of the data. You may use Schema.INFERRED to infer the schema.
* @param opts various JSON parsing options.
* @param buffer raw UTF8 formatted bytes.
* @param offset the starting offset into buffer.
* @param len the number of bytes to parse.
* @param hostMemoryAllocator allocator for host memory buffers
* @param emptyRowCount the number of rows to return if no columns were read.
* @return the data parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset,
long len, HostMemoryAllocator hostMemoryAllocator,
int emptyRowCount) {
if (len <= 0) {
len = buffer.length - offset;
}
Expand All @@ -1292,10 +1314,16 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon
assert offset >= 0 && offset < buffer.length;
try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) {
newBuf.setBytes(0, buffer, offset, len);
return readJSON(schema, opts, newBuf, 0, len);
return readJSON(schema, opts, newBuf, 0, len, emptyRowCount);
}
}

/**
 * Read JSON formatted data, staging the bytes with the default host memory allocator.
 * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema.
 * @param opts various JSON parsing options.
 * @param buffer raw UTF8 formatted bytes.
 * @param offset the starting offset into buffer.
 * @param len the number of bytes to parse.
 * @param emptyRowCount the number of rows to return if no columns were read.
 * @return the data parsed as a table on the GPU.
 */
public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset,
                             long len, int emptyRowCount) {
  final HostMemoryAllocator defaultAllocator = DefaultHostMemoryAllocator.get();
  return readJSON(schema, opts, buffer, offset, len, defaultAllocator, emptyRowCount);
}

public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset,
long len) {
return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get());
Expand Down Expand Up @@ -1357,6 +1385,21 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
*/
public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer,
                             long offset, long len) {
  // A negative emptyRowCount tells the reader that no fallback row count was supplied.
  final int noEmptyRowCount = -1;
  return readJSON(schema, opts, buffer, offset, len, noEmptyRowCount);
}

/**
* Read JSON formatted data.
* @param schema the schema of the data. You may use Schema.INFERRED to infer the schema.
* @param opts various JSON parsing options.
* @param buffer raw UTF8 formatted bytes.
* @param offset the starting offset into buffer.
* @param len the number of bytes to parse.
* @param emptyRowCount the number of rows to use if no columns were found.
* @return the data parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer,
long offset, long len, int emptyRowCount) {
if (len <= 0) {
len = buffer.length - offset;
}
Expand All @@ -1370,7 +1413,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
opts.isNormalizeWhitespace(),
opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) {
return gatherJSONColumns(schema, twm);
return gatherJSONColumns(schema, twm, emptyRowCount);
}
}

Expand All @@ -1382,13 +1425,25 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
* @return the data parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
  // A negative emptyRowCount tells the reader that no fallback row count was supplied.
  final int noEmptyRowCount = -1;
  return readJSON(schema, opts, ds, noEmptyRowCount);
}

/**
* Read JSON formatted data.
* @param schema the schema of the data. You may use Schema.INFERRED to infer the schema.
* @param opts various JSON parsing options.
* @param ds the DataSource to read from.
* @param emtpyRowCount the number of rows to return if no columns were read.
* @return the data parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emtpyRowCount) {
long dsHandle = DataSourceHelper.createWrapperDataSource(ds);
try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(),
opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
opts.isNormalizeWhitespace(),
opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), dsHandle))) {
return gatherJSONColumns(schema, twm);
return gatherJSONColumns(schema, twm, emtpyRowCount);
} finally {
DataSourceHelper.destroyWrapperDataSource(dsHandle);
}
Expand Down
7 changes: 4 additions & 3 deletions java/src/main/java/ai/rapids/cudf/TableWithMeta.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@

package ai.rapids.cudf;

import java.util.Arrays;

/**
* A table along with some metadata about the table. This is typically returned when
* reading data from an input file where the metadata can be important.
Expand Down Expand Up @@ -80,7 +78,7 @@ public String toString() {
*/
public Table releaseTable() {
long[] ptr = releaseTable(handle);
if (ptr == null) {
if (ptr == null || ptr.length == 0) {
return null;
} else {
return new Table(ptr);
Expand Down Expand Up @@ -120,6 +118,9 @@ NestedChildren getChildren() {
String[] flatNames = getFlattenedColumnNames(handle);
ChildAndOffset tmp = unflatten(0, flatNames, flatCount);
children = tmp.child;
if (children == null) {
children = new NestedChildren(new String[0], new NestedChildren[0]);
}
}
return children;
}
Expand Down
53 changes: 53 additions & 0 deletions java/src/test/java/ai/rapids/cudf/TableTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,59 @@ void testReadJSONFile() {
}
}

// One newline-terminated JSON record that is an empty object: a row is present but has no fields.
private static final byte[] EMPTY_JSON_DATA_BUFFER = ("{}\n").getBytes(StandardCharsets.UTF_8);

/**
 * Reads a single empty JSON object while asking for one row when no columns come back,
 * and expects the requested "name" column to be a single null string.
 */
@Test
void testReadEmptyJson() {
  Schema.Builder schemaBuilder = Schema.builder();
  schemaBuilder.column(DType.STRING, "name");
  Schema requestedSchema = schemaBuilder.build();

  JSONOptions jsonOptions = JSONOptions.builder()
      .withKeepQuotes(true)
      .withRecoverWithNull(true)
      .withMixedTypesAsStrings(true)
      .withNormalizeSingleQuotes(true)
      .withNormalizeWhitespace(true)
      .withLines(true)
      .build();

  // Row count the reader should fall back to when the parsed data yields no columns.
  final int fallbackRows = 1;
  final int dataLength = EMPTY_JSON_DATA_BUFFER.length;

  try (Table wanted = new Table.TestBuilder().column((String) null).build()) {
    try (Table actual = Table.readJSON(requestedSchema, jsonOptions, EMPTY_JSON_DATA_BUFFER, 0,
        dataLength, fallbackRows)) {
      assertTablesAreEqual(wanted, actual);
    }
  }
}

// One newline-terminated JSON record whose only field "a" is an empty array; the key is
// written with single quotes.
private static final byte[] EMPTY_ARRAY_JSON_DATA_BUFFER =
("{'a':[]}\n").getBytes(StandardCharsets.UTF_8);

/**
 * Reads a record whose only field is an empty array and expects a one-row LIST column
 * containing an empty list.
 */
@Test
void testReadEmptyArrayJson() {
  Schema.Builder rootBuilder = Schema.builder();
  // INT8 is selected here because CUDF always returns INT8 for this no matter what we ask for.
  rootBuilder.addColumn(DType.LIST, "a").addColumn(DType.INT8, "child");
  Schema requestedSchema = rootBuilder.build();

  JSONOptions jsonOptions = JSONOptions.builder()
      .withKeepQuotes(true)
      .withRecoverWithNull(true)
      .withMixedTypesAsStrings(true)
      .withNormalizeSingleQuotes(true)
      .withNormalizeWhitespace(true)
      .withLines(true)
      .build();

  BasicType elementType = new BasicType(true, DType.INT8);
  ListType listOfBytes = new ListType(true, elementType);

  // Row count the reader should fall back to when the parsed data yields no columns.
  final int fallbackRows = 1;
  final int dataLength = EMPTY_ARRAY_JSON_DATA_BUFFER.length;

  try (Table wanted = new Table.TestBuilder()
      .column(listOfBytes, new ArrayList<Byte>())
      .build()) {
    try (Table actual = Table.readJSON(requestedSchema, jsonOptions,
        EMPTY_ARRAY_JSON_DATA_BUFFER, 0, dataLength, fallbackRows)) {
      TableDebug.get().debug("OUTPUT", actual);
      assertTablesAreEqual(wanted, actual);
    }
  }
}

@Test
void testReadSingleQuotesJSONFile() throws IOException {
Schema schema = Schema.builder()
Expand Down

0 comments on commit 610c022

Please sign in to comment.