Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add in option for Java JSON APIs to do column pruning in CUDF #16796

Merged
merged 5 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions java/src/main/java/ai/rapids/cudf/JSONOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public final class JSONOptions extends ColumnFilterOptions {
private final boolean allowLeadingZeros;
private final boolean allowNonNumericNumbers;
private final boolean allowUnquotedControlChars;
private final boolean cudfPruneSchema;
private final byte lineDelimiter;

private JSONOptions(Builder builder) {
Expand All @@ -53,9 +54,14 @@ private JSONOptions(Builder builder) {
allowLeadingZeros = builder.allowLeadingZeros;
allowNonNumericNumbers = builder.allowNonNumericNumbers;
allowUnquotedControlChars = builder.allowUnquotedControlChars;
cudfPruneSchema = builder.cudfPruneSchema;
lineDelimiter = builder.lineDelimiter;
}

public boolean shouldCudfPruneSchema() {
return cudfPruneSchema;
}

public byte getLineDelimiter() {
return lineDelimiter;
}
Expand Down Expand Up @@ -129,8 +135,14 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptio
private boolean mixedTypesAsStrings = false;
private boolean keepQuotes = false;

private boolean cudfPruneSchema = false;
private byte lineDelimiter = '\n';

public Builder withCudfPruneSchema(boolean prune) {
cudfPruneSchema = prune;
return this;
}

public Builder withLineDelimiter(char delimiter) {
if (delimiter > Byte.MAX_VALUE) {
throw new IllegalArgumentException("Only basic ASCII values are supported as line delimiters " + delimiter);
Expand Down
17 changes: 17 additions & 0 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
boolean pruneColumns,
byte lineDelimiter) throws CudfException;

private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
Expand All @@ -273,6 +274,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
boolean pruneColumns,
byte lineDelimiter,
long dsHandle) throws CudfException;

Expand Down Expand Up @@ -1312,6 +1314,10 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp
* @return the file parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, File path) {
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
Comment on lines +1318 to +1319
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since getColumnNames calls toArray, consider storing it in a local variable. Maybe we could introduce a utils method since this PR does it a few times.

opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(
readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(),
schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(),
Expand All @@ -1326,6 +1332,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
cudfPruneSchema,
opts.getLineDelimiter()))) {

return gatherJSONColumns(schema, twm, -1);
Expand Down Expand Up @@ -1472,6 +1479,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
assert len > 0;
assert len <= buffer.length - offset;
assert offset >= 0 && offset < buffer.length;
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(readJSON(
schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(),
schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null,
Expand All @@ -1487,6 +1498,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
cudfPruneSchema,
opts.getLineDelimiter()))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
}
Expand All @@ -1513,6 +1525,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
*/
public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) {
long dsHandle = DataSourceHelper.createWrapperDataSource(ds);
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(),
opts.isDayFirst(),
Expand All @@ -1526,6 +1542,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
cudfPruneSchema,
opts.getLineDelimiter(),
dsHandle))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
Expand Down
12 changes: 9 additions & 3 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1649,7 +1649,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(false);
abellina marked this conversation as resolved.
Show resolved Hide resolved
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down Expand Up @@ -1703,6 +1704,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.strict_validation(strict_validation)
.mixed_types_as_string(mixed_types_as_string)
.prune_columns(false)
.delimiter(static_cast<char>(line_delimiter))
.keep_quotes(keep_quotes);
if (strict_validation) {
Expand Down Expand Up @@ -1818,6 +1820,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jboolean prune_columns,
jbyte line_delimiter,
jlong ds_handle)
{
Expand Down Expand Up @@ -1855,7 +1858,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(prune_columns);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down Expand Up @@ -1915,6 +1919,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jboolean prune_columns,
jbyte line_delimiter)
{
bool read_buffer = true;
Expand Down Expand Up @@ -1966,7 +1971,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(prune_columns);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down
Loading