-
Notifications
You must be signed in to change notification settings - Fork 917
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update JNI JSON reader column compatability for Spark #13477
Changes from all commits
16b591f
84cef2d
6f11a83
e15b217
fce04d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -233,7 +233,10 @@ private static native long[] readCSV(String[] columnNames, | |
byte comment, String[] nullValues, | ||
String[] trueValues, String[] falseValues) throws CudfException; | ||
|
||
private static native long[] readJSON(String[] columnNames, | ||
/** | ||
* read JSON data and return a pointer to a TableWithMeta object. | ||
*/ | ||
private static native long readJSON(String[] columnNames, | ||
int[] dTypeIds, int[] dTypeScales, | ||
String filePath, long address, long length, | ||
boolean dayFirst, boolean lines) throws CudfException; | ||
Comment on lines
240
to
242
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: These parameters are now mis-indented |
||
|
@@ -968,6 +971,42 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer) { | |
return readJSON(schema, opts, buffer, 0, buffer.length); | ||
} | ||
|
||
private static Table gatherJSONColumns(Schema schema, TableWithMeta twm) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would be nice to have a comment on this method explaining it will release and close the underlying table of twm as a side-effect. |
||
String[] neededColumns = schema.getColumnNames(); | ||
if (neededColumns == null || neededColumns.length == 0) { | ||
return twm.releaseTable(); | ||
} else { | ||
String[] foundNames = twm.getColumnNames(); | ||
HashMap<String, Integer> indices = new HashMap<>(); | ||
for (int i = 0; i < foundNames.length; i++) { | ||
indices.put(foundNames[i], i); | ||
} | ||
mythrocks marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// We might need to rearrange the columns to match what we want. | ||
DType[] types = schema.getTypes(); | ||
ColumnVector[] columns = new ColumnVector[neededColumns.length]; | ||
try (Table tbl = twm.releaseTable()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Seems like this should be the first thing in the method so no matter what happens (NPE, whatever) we're closing the table. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TableWithMetadata will close the table if we don't pull it out first. |
||
for (int i = 0; i < columns.length; i++) { | ||
String neededColumnName = neededColumns[i]; | ||
Integer index = indices.get(neededColumnName); | ||
if (index != null) { | ||
columns[i] = tbl.getColumn(index).incRefCount(); | ||
} else { | ||
try (Scalar s = Scalar.fromNull(types[i])) { | ||
columns[i] = ColumnVector.fromScalar(s, (int)tbl.getRowCount()); | ||
} | ||
} | ||
} | ||
return new Table(columns); | ||
} finally { | ||
for (ColumnVector c: columns) { | ||
if (c != null) { | ||
c.close(); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Read a JSON file. | ||
* @param schema the schema of the file. You may use Schema.INFERRED to infer the schema. | ||
|
@@ -976,11 +1015,14 @@ public static Table readJSON(Schema schema, JSONOptions opts, byte[] buffer) { | |
* @return the file parsed as a table on the GPU. | ||
*/ | ||
public static Table readJSON(Schema schema, JSONOptions opts, File path) { | ||
return new Table( | ||
readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), | ||
path.getAbsolutePath(), | ||
0, 0, | ||
opts.isDayFirst(), opts.isLines())); | ||
try (TableWithMeta twm = new TableWithMeta( | ||
readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), | ||
path.getAbsolutePath(), | ||
0, 0, | ||
opts.isDayFirst(), opts.isLines()))) { | ||
|
||
return gatherJSONColumns(schema, twm); | ||
} | ||
} | ||
|
||
/** | ||
|
@@ -1043,9 +1085,11 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b | |
assert len > 0; | ||
assert len <= buffer.length - offset; | ||
assert offset >= 0 && offset < buffer.length; | ||
return new Table(readJSON(schema.getColumnNames(), schema.getTypeIds(), schema.getTypeScales(), | ||
null, buffer.getAddress() + offset, len, | ||
opts.isDayFirst(), opts.isLines())); | ||
try (TableWithMeta twm = new TableWithMeta(readJSON(schema.getColumnNames(), | ||
schema.getTypeIds(), schema.getTypeScales(), null, | ||
buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines()))) { | ||
return gatherJSONColumns(schema, twm); | ||
} | ||
} | ||
|
||
/** | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Neat. That would pack nicely to: