Fix an NPE when reading empty JSON data by adding a new API for supplying the missing row count #15307

Merged: 4 commits, Mar 15, 2024
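For orientation, here is a minimal caller-side sketch (not part of the PR) of the new byte[] overload this change introduces; the class name is illustrative, and the schema/options mirror the new testReadEmptyJson test further down. The trailing 1 is the new emptyRowCount argument: the number of rows to synthesize when the read yields no columns.

import java.nio.charset.StandardCharsets;

import ai.rapids.cudf.DType;
import ai.rapids.cudf.JSONOptions;
import ai.rapids.cudf.Schema;
import ai.rapids.cudf.Table;

public class EmptyJsonReadExample {
  public static void main(String[] args) {
    byte[] data = "{}\n".getBytes(StandardCharsets.UTF_8);
    Schema schema = Schema.builder()
        .column(DType.STRING, "name")
        .build();
    JSONOptions opts = JSONOptions.builder()
        .withLines(true)
        .withRecoverWithNull(true)
        .build();
    // Before this PR, reading this input through the existing 5-argument overload hit an
    // NPE because the native read returns no columns for "{}". With emptyRowCount = 1 the
    // result is a one-row table whose "name" column is null.
    try (Table table = Table.readJSON(schema, opts, data, 0, data.length, 1)) {
      System.out.println("rows: " + table.getRowCount()); // expected: 1
    }
  }
}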
67 changes: 61 additions & 6 deletions java/src/main/java/ai/rapids/cudf/Table.java
@@ -1203,7 +1203,7 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest
}
}

private static Table gatherJSONColumns(Schema schema, TableWithMeta twm) {
private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emptyRowCount) {
String[] neededColumns = schema.getColumnNames();
if (neededColumns == null || neededColumns.length == 0) {
return twm.releaseTable();
@@ -1217,6 +1217,11 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm) {
DType[] types = schema.getChildTypes();
ColumnVector[] columns = new ColumnVector[neededColumns.length];
try (Table tbl = twm.releaseTable()) {
int rowCount = tbl == null ? emptyRowCount : (int)tbl.getRowCount();
if (rowCount < 0) {
throw new IllegalStateException(
"No empty row count provided and the table read has no row count or columns");
}
for (int i = 0; i < columns.length; i++) {
String neededColumnName = neededColumns[i];
Integer index = indices.get(neededColumnName);
@@ -1234,7 +1239,7 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm) {
}
} else {
try (Scalar s = Scalar.fromNull(types[i])) {
columns[i] = ColumnVector.fromScalar(s, (int)tbl.getRowCount());
columns[i] = ColumnVector.fromScalar(s, rowCount);
}
}
}
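To make the fallback rule in the hunk above explicit, here is the same decision in isolation (a hypothetical standalone helper, not part of the PR):

// If a table was actually read, trust its row count; otherwise fall back to the
// caller-supplied emptyRowCount. A negative fallback means the caller gave us no
// way to size the all-null columns, so fail fast.
static int resolveRowCount(Table tbl, int emptyRowCount) {
  int rowCount = tbl == null ? emptyRowCount : (int) tbl.getRowCount();
  if (rowCount < 0) {
    throw new IllegalStateException(
        "No empty row count provided and the table read has no row count or columns");
  }
  return rowCount;
}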
@@ -1268,7 +1273,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
opts.isMixedTypesAsStrings(),
opts.keepStringQuotes()))) {

return gatherJSONColumns(schema, twm);
return gatherJSONColumns(schema, twm, -1);
}
}

@@ -1284,6 +1289,23 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
*/
public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset,
long len, HostMemoryAllocator hostMemoryAllocator) {
return readJSON(schema, opts, buffer, offset, len, hostMemoryAllocator, -1);
}

/**
* Read JSON formatted data.
* @param schema the schema of the data. You may use Schema.INFERRED to infer the schema.
* @param opts various JSON parsing options.
* @param buffer raw UTF8 formatted bytes.
* @param offset the starting offset into buffer.
* @param len the number of bytes to parse.
* @param hostMemoryAllocator allocator for host memory buffers
* @param emptyRowCount the number of rows to return if no columns were read.
* @return the data parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset,
long len, HostMemoryAllocator hostMemoryAllocator,
int emptyRowCount) {
if (len <= 0) {
len = buffer.length - offset;
}
@@ -1292,10 +1314,16 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, lon
assert offset >= 0 && offset < buffer.length;
try (HostMemoryBuffer newBuf = hostMemoryAllocator.allocate(len)) {
newBuf.setBytes(0, buffer, offset, len);
return readJSON(schema, opts, newBuf, 0, len);
return readJSON(schema, opts, newBuf, 0, len, emptyRowCount);
}
}

public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset,
long len, int emptyRowCount) {
return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get(),
emptyRowCount);
}

public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer, long offset,
long len) {
return readJSON(schema, opts, buffer, offset, len, DefaultHostMemoryAllocator.get());
@@ -1357,6 +1385,21 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
*/
public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer,
long offset, long len) {
return readJSON(schema, opts, buffer, offset, len, -1);
}

/**
* Read JSON formatted data.
* @param schema the schema of the data. You may use Schema.INFERRED to infer the schema.
* @param opts various JSON parsing options.
* @param buffer raw UTF8 formatted bytes.
* @param offset the starting offset into buffer.
* @param len the number of bytes to parse.
* @param emptyRowCount the number of rows to use if no columns were found.
Review comment (Contributor): I was wondering if there might be a better name for this. Under the circumstances, emptyRowCount is really the best name for this parameter.
* @return the data parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer buffer,
long offset, long len, int emptyRowCount) {
if (len <= 0) {
len = buffer.length - offset;
}
@@ -1370,7 +1413,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
opts.isNormalizeWhitespace(),
opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) {
return gatherJSONColumns(schema, twm);
return gatherJSONColumns(schema, twm, emptyRowCount);
}
}
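A usage sketch for the HostMemoryBuffer overload above (not part of the PR; the helper name is hypothetical, and the static HostMemoryBuffer.allocate factory plus setBytes are assumed to behave as they are used elsewhere in this file):

static Table readJsonFromHostBuffer(Schema schema, JSONOptions opts, byte[] json) {
  // Stage the bytes in host memory, then read with a fallback row count of 1 so an
  // input with no columns still produces a one-row, all-null table.
  try (HostMemoryBuffer buf = HostMemoryBuffer.allocate(json.length)) {
    buf.setBytes(0, json, 0, json.length);
    return Table.readJSON(schema, opts, buf, 0, json.length, 1);
  }
}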

@@ -1382,13 +1425,25 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
* @return the data parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
return readJSON(schema, opts, ds, -1);
}

/**
* Read JSON formatted data.
* @param schema the schema of the data. You may use Schema.INFERRED to infer the schema.
* @param opts various JSON parsing options.
* @param ds the DataSource to read from.
 * @param emptyRowCount the number of rows to return if no columns were read.
* @return the data parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) {
long dsHandle = DataSourceHelper.createWrapperDataSource(ds);
try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(),
opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
opts.isNormalizeWhitespace(),
opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), dsHandle))) {
return gatherJSONColumns(schema, twm);
return gatherJSONColumns(schema, twm, emptyRowCount);
} finally {
DataSourceHelper.destroyWrapperDataSource(dsHandle);
}
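And the equivalent sketch for the DataSource overload (hypothetical helper; the DataSource instance is assumed to be constructed by the caller):

static Table readJsonFromDataSource(Schema schema, JSONOptions opts, DataSource ds) {
  // A non-negative emptyRowCount (here 1) asks for a one-row, all-null table instead
  // of an IllegalStateException when the source yields no columns.
  return Table.readJSON(schema, opts, ds, 1);
}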
7 changes: 4 additions & 3 deletions java/src/main/java/ai/rapids/cudf/TableWithMeta.java
@@ -19,8 +19,6 @@

package ai.rapids.cudf;

import java.util.Arrays;

/**
* A table along with some metadata about the table. This is typically returned when
* reading data from an input file where the metadata can be important.
@@ -80,7 +78,7 @@ public String toString() {
*/
public Table releaseTable() {
long[] ptr = releaseTable(handle);
if (ptr == null) {
if (ptr == null || ptr.length == 0) {
return null;
} else {
return new Table(ptr);
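With this change, releaseTable() also returns null when the native read hands back zero column handles, so callers need the kind of null check gatherJSONColumns now performs. A caller-side sketch of the new contract (hypothetical helper name):

static long rowCountOrDefault(TableWithMeta twm, int emptyRowCount) {
  // releaseTable() may now legitimately return null (empty read); try-with-resources
  // tolerates a null resource, so only the non-null case is closed.
  try (Table t = twm.releaseTable()) {
    return t == null ? emptyRowCount : t.getRowCount();
  }
}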
@@ -120,6 +118,9 @@ NestedChildren getChildren() {
String[] flatNames = getFlattenedColumnNames(handle);
ChildAndOffset tmp = unflatten(0, flatNames, flatCount);
children = tmp.child;
if (children == null) {
children = new NestedChildren(new String[0], new NestedChildren[0]);
}
}
return children;
}
53 changes: 53 additions & 0 deletions java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -331,6 +331,59 @@ void testReadJSONFile() {
}
}

private static final byte[] EMPTY_JSON_DATA_BUFFER = ("{}\n").getBytes(StandardCharsets.UTF_8);

@Test
void testReadEmptyJson() {
Schema schema = Schema.builder()
.column(DType.STRING, "name")
.build();
JSONOptions opts = JSONOptions.builder()
.withKeepQuotes(true)
.withRecoverWithNull(true)
.withMixedTypesAsStrings(true)
.withNormalizeSingleQuotes(true)
.withNormalizeWhitespace(true)
.withLines(true)
.build();
try (Table expected = new Table.TestBuilder()
.column((String)null)
.build();
Table table = Table.readJSON(schema, opts, EMPTY_JSON_DATA_BUFFER, 0,
EMPTY_JSON_DATA_BUFFER.length, 1)) {
assertTablesAreEqual(expected, table);
}
}

private static final byte[] EMPTY_ARRAY_JSON_DATA_BUFFER =
("{'a':[]}\n").getBytes(StandardCharsets.UTF_8);

@Test
void testReadEmptyArrayJson() {
Schema.Builder builder = Schema.builder();
Schema.Builder listBuilder = builder.addColumn(DType.LIST, "a");
// INT8 is selected here because CUDF always returns INT8 for this no matter what we ask for.
listBuilder.addColumn(DType.INT8, "child");
Schema schema = builder.build();
JSONOptions opts = JSONOptions.builder()
.withKeepQuotes(true)
.withRecoverWithNull(true)
.withMixedTypesAsStrings(true)
.withNormalizeSingleQuotes(true)
.withNormalizeWhitespace(true)
.withLines(true)
.build();
ListType lt = new ListType(true, new BasicType(true, DType.INT8));
try (Table expected = new Table.TestBuilder()
.column(lt, new ArrayList<Byte>())
.build();
Table table = Table.readJSON(schema, opts, EMPTY_ARRAY_JSON_DATA_BUFFER, 0,
EMPTY_ARRAY_JSON_DATA_BUFFER.length, 1)) {
TableDebug.get().debug("OUTPUT", table);
assertTablesAreEqual(expected, table);
}
}

@Test
void testReadSingleQuotesJSONFile() throws IOException {
Schema schema = Schema.builder()