Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add java option to keep quotes for JSON reads #15146

Merged
merged 4 commits into from
Feb 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions java/src/main/java/ai/rapids/cudf/JSONOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ public final class JSONOptions extends ColumnFilterOptions {
private final boolean recoverWithNull;
private final boolean normalizeSingleQuotes;
private final boolean mixedTypesAsStrings;
private final boolean keepStringQuotes;

private JSONOptions(Builder builder) {
super(builder);
Expand All @@ -40,6 +41,7 @@ private JSONOptions(Builder builder) {
recoverWithNull = builder.recoverWithNull;
normalizeSingleQuotes = builder.normalizeSingleQuotes;
mixedTypesAsStrings = builder.mixedTypesAsStrings;
keepStringQuotes = builder.keepQuotes;
}

public boolean isDayFirst() {
Expand All @@ -63,6 +65,10 @@ public boolean isMixedTypesAsStrings() {
return mixedTypesAsStrings;
}

/**
 * Whether the JSON reader should keep the quotes of string values.
 * @return the value set through the builder's withKeepQuotes; false when it was never set.
 */
public boolean keepStringQuotes() {
return keepStringQuotes;
}

@Override
String[] getIncludeColumnNames() {
throw new UnsupportedOperationException("JSON reader didn't support column prune");
Expand All @@ -80,6 +86,7 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptio
private boolean normalizeSingleQuotes = false;

private boolean mixedTypesAsStrings = false;
private boolean keepQuotes = false;

/**
* Whether to parse dates as DD/MM versus MM/DD
Expand Down Expand Up @@ -135,6 +142,16 @@ public Builder withMixedTypesAsStrings(boolean mixedTypesAsStrings) {
return this;
}

/**
 * Set whether the reader should keep the quotes of string values.
 * When this method is never called the option stays at its default of false.
 * @param keepQuotes true to keep the quotes, false otherwise.
 * @return this for chaining.
 */
public Builder withKeepQuotes(boolean keepQuotes) {
this.keepQuotes = keepQuotes;
return this;
}

@Override
public Builder includeColumn(String... names) {
throw new UnsupportedOperationException("JSON reader didn't support column prune");
Expand Down
22 changes: 16 additions & 6 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -252,23 +252,31 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
boolean dayFirst, boolean lines,
boolean recoverWithNulls,
boolean normalizeSingleQuotes,
boolean mixedTypesAsStrings) throws CudfException;
boolean mixedTypesAsStrings,
boolean keepStringQuotes) throws CudfException;

/**
 * Native entry point that reads JSON from a data source using an explicit schema.
 * The schema is passed flattened: child counts, column names, type ids and type scales.
 * @param keepStringQuotes forwarded to the native JSON reader as its keep_quotes option.
 * @param dsHandle handle of the wrapped data source; the native side null-checks it.
 * @return a native value that the visible caller wraps in a TableWithMeta.
 */
private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
int[] dTypeIds, int[] dTypeScales,
boolean dayFirst, boolean lines,
boolean recoverWithNulls,
boolean normalizeSingleQuotes,
boolean mixedTypesAsStrings,
boolean keepStringQuotes,
long dsHandle) throws CudfException;

/**
 * Native entry point that reads JSON from a data source without a caller-supplied schema.
 * @param keepStringQuotes forwarded to the native JSON reader as its keep_quotes option.
 * @param dsHandle handle of the wrapped data source; the native side null-checks it.
 */
private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
boolean recoverWithNulls,
boolean normalizeSingleQuotes,
boolean mixedTypesAsStrings,
boolean keepStringQuotes,
long dsHandle) throws CudfException;
/**
 * Native entry point that reads JSON from a memory region without a caller-supplied schema.
 * @param address start address of the JSON data; the native side null-checks it.
 * @param length number of bytes to read; the native side has a check for values <= 0
 *               (NOTE(review): its outcome is not visible here — confirm).
 * @param keepStringQuotes forwarded to the native JSON reader as its keep_quotes option.
 * @return a native value that the visible caller wraps in a TableWithMeta.
 */
private static native long readAndInferJSON(long address, long length,
boolean dayFirst,
boolean lines,
boolean recoverWithNulls,
boolean normalizeSingleQuotes,
boolean mixedTypesAsStrings,
boolean keepStringQuotes) throws CudfException;

/**
* Read in Parquet formatted data.
Expand Down Expand Up @@ -1246,7 +1254,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
0, 0,
opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
opts.isNormalizeSingleQuotes(),
opts.isMixedTypesAsStrings()))) {
opts.isMixedTypesAsStrings(),
opts.keepStringQuotes()))) {

return gatherJSONColumns(schema, twm);
}
Expand Down Expand Up @@ -1300,7 +1309,7 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
return new TableWithMeta(readAndInferJSON(buffer.getAddress() + offset, len,
opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
opts.isNormalizeSingleQuotes(),
opts.isMixedTypesAsStrings()));
opts.isMixedTypesAsStrings(), opts.keepStringQuotes()));
}

/**
Expand All @@ -1316,6 +1325,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
opts.isRecoverWithNull(),
opts.isNormalizeSingleQuotes(),
opts.isMixedTypesAsStrings(),
opts.keepStringQuotes(),
dsHandle));
return twm;
} finally {
Expand Down Expand Up @@ -1345,7 +1355,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null,
buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
opts.isMixedTypesAsStrings()))) {
opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) {
return gatherJSONColumns(schema, twm);
}
}
Expand All @@ -1362,7 +1372,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(),
opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
opts.isMixedTypesAsStrings(), dsHandle))) {
opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), dsHandle))) {
return gatherJSONColumns(schema, twm);
} finally {
DataSourceHelper.destroyWrapperDataSource(dsHandle);
Expand Down
19 changes: 13 additions & 6 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1429,7 +1429,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(
JNIEnv *env, jclass, jboolean day_first, jboolean lines, jboolean recover_with_null,
jboolean normalize_single_quotes, jboolean mixed_types_as_string, jlong ds_handle) {
jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes,
jlong ds_handle) {

JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);

Expand All @@ -1447,6 +1448,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode)
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.keep_quotes(keep_quotes)
.mixed_types_as_string(mixed_types_as_string);

auto result =
Expand All @@ -1459,7 +1461,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource

JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string) {
jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string,
jboolean keep_quotes) {

JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
if (buffer_length <= 0) {
Expand All @@ -1481,6 +1484,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode)
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.keep_quotes(keep_quotes)
.mixed_types_as_string(mixed_types_as_string);

auto result =
Expand Down Expand Up @@ -1569,7 +1573,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE
JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types,
jintArray j_scales, jboolean day_first, jboolean lines, jboolean recover_with_null,
jboolean normalize_single_quotes, jboolean mixed_types_as_string, jlong ds_handle) {
jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes,
jlong ds_handle) {

JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);

Expand Down Expand Up @@ -1601,7 +1606,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode)
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.mixed_types_as_string(mixed_types_as_string);
.mixed_types_as_string(mixed_types_as_string)
.keep_quotes(keep_quotes);

if (!n_types.is_null()) {
if (n_types.size() != n_scales.size()) {
Expand Down Expand Up @@ -1640,7 +1646,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types,
jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length,
jboolean day_first, jboolean lines, jboolean recover_with_null,
jboolean normalize_single_quotes, jboolean mixed_types_as_string) {
jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes) {

bool read_buffer = true;
if (buffer == 0) {
Expand Down Expand Up @@ -1687,7 +1693,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
.lines(static_cast<bool>(lines))
.recovery_mode(recovery_mode)
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.mixed_types_as_string(mixed_types_as_string);
.mixed_types_as_string(mixed_types_as_string)
.keep_quotes(keep_quotes);

if (!n_types.is_null()) {
if (n_types.size() != n_scales.size()) {
Expand Down
19 changes: 19 additions & 0 deletions java/src/test/java/ai/rapids/cudf/TableTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,25 @@ void testReadSingleQuotesJSONFile() throws IOException {
}
}

@Test
void testReadSingleQuotesJSONFileKeepQuotes() throws IOException {
  // Reader configuration: line-delimited JSON, single quotes normalized, string quotes kept.
  JSONOptions.Builder optionsBuilder = JSONOptions.builder();
  optionsBuilder.withLines(true);
  optionsBuilder.withNormalizeSingleQuotes(true);
  optionsBuilder.withKeepQuotes(true);
  JSONOptions keepQuotesOptions = optionsBuilder.build();

  // A single string column named "A".
  Schema singleStringColumn = Schema.builder().column(DType.STRING, "A").build();

  // Resources are declared in the same order as before so they close in the same order.
  try (Table wanted = new Table.TestBuilder()
           // Note that escapes are also processed
           .column("\"TEST\"\"", "\"TESTER'\"")
           .build();
       MultiBufferDataSource jsonSource = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE);
       Table actual = Table.readJSON(singleStringColumn, keepQuotesOptions, jsonSource)) {
    assertTablesAreEqual(wanted, actual);
  }
}

private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" +
"{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" +
"{\"d\":[1,2,3]}\n" +
Expand Down
Loading