From 49c2995b1b861b12d3b25ad997adec9c50ed872f Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Mon, 12 Feb 2024 09:46:56 -0800 Subject: [PATCH] Introduce `GetJsonObjectOptions` in `getJSONObject` Java API (#14956) Resolves [10219](https://github.com/NVIDIA/spark-rapids/issues/10219) This PR introduces a new class named `GetJsonObjectOptions` that holds the configurations to control the behavior of the underlying `cudf::get_json_object` function. It incorporates this new class into the `getJSONObject` JAVA API as an additional argument but also keeps the previous API to maintain backwards compatibility. It also includes a test case, `testGetJSONObjectWithSingleQuotes`, validating the behavior of `getJSONObject` when single quotes are enabled. Authors: - Suraj Aralihalli (https://github.com/SurajAralihalli) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - MithunR (https://github.com/mythrocks) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14956 --- .../main/java/ai/rapids/cudf/ColumnView.java | 22 +++++- .../ai/rapids/cudf/GetJsonObjectOptions.java | 75 +++++++++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 12 ++- .../java/ai/rapids/cudf/ColumnVectorTest.java | 16 ++++ 4 files changed, 119 insertions(+), 6 deletions(-) create mode 100644 java/src/main/java/ai/rapids/cudf/GetJsonObjectOptions.java diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 8eabed7f364..997ff77bae3 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2978,6 +2978,24 @@ public final ColumnVector repeatStrings(ColumnView repeatTimes) { repeatTimes.getNativeView())); } + /** + * Apply a JSONPath string to all rows in an input strings column. + * + * Applies a JSONPath string to an incoming strings column where each row in the column + * is a valid json string. 
The output is returned by row as a strings column. + * + * For reference, https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html + * Note: Only implements the operators: $ . [] * + * + * @param path The JSONPath string to be applied to each row + * @param options The GetJsonObjectOptions to control get_json_object behaviour + * @return new strings ColumnVector containing the retrieved json object strings + */ + public final ColumnVector getJSONObject(Scalar path, GetJsonObjectOptions options) { + assert(type.equals(DType.STRING)) : "column type must be a String"; + return new ColumnVector(getJSONObject(getNativeView(), path.getScalarHandle(), options.isAllowSingleQuotes(), options.isStripQuotesFromSingleStrings(), options.isMissingFieldsAsNulls())); + } + /** * Apply a JSONPath string to all rows in an input strings column. * @@ -2992,7 +3010,7 @@ public final ColumnVector repeatStrings(ColumnView repeatTimes) { */ public final ColumnVector getJSONObject(Scalar path) { assert(type.equals(DType.STRING)) : "column type must be a String"; - return new ColumnVector(getJSONObject(getNativeView(), path.getScalarHandle())); + return getJSONObject(path, GetJsonObjectOptions.DEFAULT); } /** @@ -4194,7 +4212,7 @@ private static native long repeatStringsWithColumnRepeatTimes(long stringsHandle long repeatTimesHandle); - private static native long getJSONObject(long viewHandle, long scalarHandle) throws CudfException; + private static native long getJSONObject(long viewHandle, long scalarHandle, boolean allowSingleQuotes, boolean stripQuotesFromSingleStrings, boolean missingFieldsAsNulls) throws CudfException; /** * Native method to parse and convert a timestamp column vector to string column vector. 
A unix diff --git a/java/src/main/java/ai/rapids/cudf/GetJsonObjectOptions.java b/java/src/main/java/ai/rapids/cudf/GetJsonObjectOptions.java new file mode 100644 index 00000000000..5f9a174b2d3 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/GetJsonObjectOptions.java @@ -0,0 +1,75 @@ +/* + * + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +public final class GetJsonObjectOptions { + + public static final GetJsonObjectOptions DEFAULT = new GetJsonObjectOptions.Builder().build(); + + private final boolean allowSingleQuotes; + private final boolean stripQuotesFromSingleStrings; + private final boolean missingFieldsAsNulls; + + private GetJsonObjectOptions(Builder builder) { + this.allowSingleQuotes = builder.allowSingleQuotes; + this.stripQuotesFromSingleStrings = builder.stripQuotesFromSingleStrings; + this.missingFieldsAsNulls = builder.missingFieldsAsNulls; + } + + public boolean isAllowSingleQuotes() { + return allowSingleQuotes; + } + + public boolean isStripQuotesFromSingleStrings() { + return stripQuotesFromSingleStrings; + } + + public boolean isMissingFieldsAsNulls() { + return missingFieldsAsNulls; + } + + public static Builder builder() { + return new Builder(); + } + + public static final class Builder { + private boolean allowSingleQuotes = false; + private boolean stripQuotesFromSingleStrings = true; + private boolean missingFieldsAsNulls = false; + + public Builder 
allowSingleQuotes(boolean allowSingleQuotes) { + this.allowSingleQuotes = allowSingleQuotes; + return this; + } + + public Builder stripQuotesFromSingleStrings(boolean stripQuotesFromSingleStrings) { + this.stripQuotesFromSingleStrings = stripQuotesFromSingleStrings; + return this; + } + + public Builder missingFieldsAsNulls(boolean missingFieldsAsNulls) { + this.missingFieldsAsNulls = missingFieldsAsNulls; + return this; + } + + public GetJsonObjectOptions build() { + return new GetJsonObjectOptions(this); + } + } +} diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 47dc802cd49..1c4eb8a83ab 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -2436,9 +2436,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv CATCH_STD(env, 0) } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jclass, - jlong j_view_handle, - jlong j_scalar_handle) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject( + JNIEnv *env, jclass, jlong j_view_handle, jlong j_scalar_handle, jboolean allow_single_quotes, + jboolean strip_quotes_from_single_strings, jboolean missing_fields_as_nulls) { JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0); JNI_NULL_CHECK(env, j_scalar_handle, "path cannot be null", 0); @@ -2448,7 +2448,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env cudf::column_view *n_column_view = reinterpret_cast<cudf::column_view *>(j_view_handle); cudf::strings_column_view n_strings_col_view(*n_column_view); cudf::string_scalar *n_scalar_path = reinterpret_cast<cudf::string_scalar *>(j_scalar_handle); - return release_as_jlong(cudf::get_json_object(n_strings_col_view, *n_scalar_path)); + auto options = cudf::get_json_object_options{}; + options.set_allow_single_quotes(allow_single_quotes); + options.set_strip_quotes_from_single_strings(strip_quotes_from_single_strings); + 
options.set_missing_fields_as_nulls(missing_fields_as_nulls); + return release_as_jlong(cudf::get_json_object(n_strings_col_view, *n_scalar_path, options)); } CATCH_STD(env, 0) } diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index dfead3716ee..75573046af2 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -6379,6 +6379,7 @@ void testGetJSONObject() { " }\n" + "}"; + try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString); ColumnVector expectedAuthors = ColumnVector.fromStrings("[\"Nigel Rees\",\"Evelyn " + "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]", "[\"Nigel Rees\",\"Evelyn " + @@ -6389,6 +6390,21 @@ void testGetJSONObject() { } } + @Test + void testGetJSONObjectWithSingleQuotes() { + String jsonString = "{" + + "\'a\': \'A\"\'" + + "}"; + + GetJsonObjectOptions options = GetJsonObjectOptions.builder().allowSingleQuotes(true).build(); + try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString); + ColumnVector expectedAuthors = ColumnVector.fromStrings("A\"", "A\""); + Scalar path = Scalar.fromString("$.a"); + ColumnVector gotAuthors = json.getJSONObject(path, options)) { + assertColumnsAreEqual(expectedAuthors, gotAuthors); + } + } + @Test void testMakeStructEmpty() { final int numRows = 10;