diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp index d74dfd57c8..2c4b8e1aaa 100644 --- a/src/main/cpp/src/JSONUtilsJni.cpp +++ b/src/main/cpp/src/JSONUtilsJni.cpp @@ -222,15 +222,15 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env, } JNIEXPORT jlong JNICALL -Java_com_nvidia_spark_rapids_jni_JSONUtils_convertDataType(JNIEnv* env, - jclass, - jlong j_input, - jintArray j_num_children, - jintArray j_types, - jintArray j_scales, - jintArray j_precisions, - jboolean allow_nonnumeric_numbers, - jboolean is_us_locale) +Java_com_nvidia_spark_rapids_jni_JSONUtils_convertFromStrings(JNIEnv* env, + jclass, + jlong j_input, + jintArray j_num_children, + jintArray j_types, + jintArray j_scales, + jintArray j_precisions, + jboolean allow_nonnumeric_numbers, + jboolean is_us_locale) { JNI_NULL_CHECK(env, j_input, "j_input is null", 0); JNI_NULL_CHECK(env, j_num_children, "j_num_children is null", 0); @@ -253,13 +253,13 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_convertDataType(JNIEnv* env, CUDF_EXPECTS(num_children.size() == precisions.size(), "Invalid schema data: precisions."); return cudf::jni::ptr_as_jlong( - spark_rapids_jni::convert_data_type(cudf::strings_column_view{*input_cv}, - num_children, - types, - scales, - precisions, - allow_nonnumeric_numbers, - is_us_locale) + spark_rapids_jni::convert_from_strings(cudf::strings_column_view{*input_cv}, + num_children, + types, + scales, + precisions, + allow_nonnumeric_numbers, + is_us_locale) .release()); } CATCH_STD(env, 0); diff --git a/src/main/cpp/src/from_json_to_structs.cu b/src/main/cpp/src/from_json_to_structs.cu index 2dc827ad29..98ce1e485b 100644 --- a/src/main/cpp/src/from_json_to_structs.cu +++ b/src/main/cpp/src/from_json_to_structs.cu @@ -260,11 +260,11 @@ std::unique_ptr cast_strings_to_integers(cudf::column_view const& mr); } + // Build a new strings column, removing the invalid rows. 
auto chars_data = cudf::strings::detail::make_chars_buffer( offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr); - // Don't care about the null mask, as nulls imply empty strings, which will also result in - // nulls. + // Don't care about the null mask, as nulls imply empty strings, which will also result in nulls. auto const sanitized_input = cudf::make_strings_column(string_count, std::move(offsets_column), chars_data.release(), 0, {}); @@ -345,8 +345,7 @@ std::pair, bool> try_remove_quotes_for_floats( auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); - // If the output has the same total bytes, the input should not be changed. - // That is because when removing quotes, we always reduce the number of characters. + // If the output has the same total bytes, the output should be the same as the input. if (bytes == input_sv.chars_size(stream)) { return {nullptr, false}; } auto chars_data = cudf::strings::detail::make_chars_buffer( @@ -372,14 +371,16 @@ std::unique_ptr cast_strings_to_floats(cudf::column_view const& in if (string_count == 0) { return cudf::make_empty_column(output_type); } if (allow_nonnumeric_numbers) { + // Non-numeric numbers are always quoted. auto const [removed_quotes, success] = try_remove_quotes_for_floats(input, stream, mr); return string_to_float(output_type, cudf::strings_column_view{success ? 
removed_quotes->view() : input}, - false, + /*ansi_mode*/ false, stream, mr); } - return string_to_float(output_type, cudf::strings_column_view{input}, false, stream, mr); + return string_to_float( + output_type, cudf::strings_column_view{input}, /*ansi_mode*/ false, stream, mr); } // TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898 @@ -480,7 +481,7 @@ std::unique_ptr cast_strings_to_decimals(cudf::column_view const& cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); auto chars_data = rmm::device_uvector(bytes, stream, mr); - // Since the strings store decimal numbers, they should be very short. + // Since the strings store decimal numbers, they should not be very long. // As such, using one thread per string should be good. thrust::for_each(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), @@ -574,8 +575,7 @@ std::pair, bool> try_remove_quotes( auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr); - // If the output has the same total bytes, the input should not be changed. - // That is because when removing quotes, we always reduce the number of characters. + // If the output has the same total bytes, the output should be the same as the input. 
if (bytes == input.chars_size(stream)) { return {nullptr, false}; } auto chars_data = cudf::strings::detail::make_chars_buffer( @@ -910,15 +910,15 @@ std::unique_ptr from_json_to_structs(cudf::strings_column_view con mr); } -std::unique_ptr convert_data_type(cudf::strings_column_view const& input, - std::vector const& num_children, - std::vector const& types, - std::vector const& scales, - std::vector const& precisions, - bool allow_nonnumeric_numbers, - bool is_us_locale, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr convert_from_strings(cudf::strings_column_view const& input, + std::vector const& num_children, + std::vector const& types, + std::vector const& scales, + std::vector const& precisions, + bool allow_nonnumeric_numbers, + bool is_us_locale, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); diff --git a/src/main/cpp/src/json_utils.hpp b/src/main/cpp/src/json_utils.hpp index 641e9b839f..319c81103d 100644 --- a/src/main/cpp/src/json_utils.hpp +++ b/src/main/cpp/src/json_utils.hpp @@ -60,11 +60,11 @@ std::unique_ptr from_json_to_structs( rmm::device_async_resource_ref mr = cudf::get_current_device_resource()); /** - * @brief Convert the input strings column into a desired type given by a data schema. + * @brief Convert from a strings column to a column with the desired type given by a data schema. * * The given column schema is specified as data arrays flattened by depth-first-search order. 
*/ -std::unique_ptr convert_data_type( +std::unique_ptr convert_from_strings( cudf::strings_column_view const& input, std::vector const& num_children, std::vector const& types, diff --git a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java index e8820c162c..9cf00acff2 100644 --- a/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java +++ b/src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java @@ -167,11 +167,22 @@ public static ColumnVector extractRawMapFromJsonString(ColumnView input, JSONOpt /** * Parse a JSON string into a struct column following by the given data schema. + *

+ * Many JSON options in the given {@code opts} parameter are not passed down to the + * native code, because these options are hard-coded with the same values in both the + * plugin code and the native code. Specifically:
+ * - {@code RecoverWithNull: true}
+ * - {@code MixedTypesAsStrings: true}
+ * - {@code NormalizeWhitespace: true}
+ * - {@code KeepQuotes: true}
+ * - {@code StrictValidation: true}
+ * - {@code Experimental: true} * * @param input The input strings column in which each row specifies a json object * @param schema The schema of the output struct column * @param opts The options for parsing JSON strings - * @param isUSLocale Whether the current local is US locale + * @param isUSLocale Whether the current locale is US locale, used when converting strings to + * decimal types * @return A struct column in which each row is parsed from the corresponding json string */ public static ColumnVector fromJSONToStructs(ColumnView input, Schema schema, JSONOptions opts, @@ -191,19 +202,21 @@ public static ColumnVector fromJSONToStructs(ColumnView input, Schema schema, JS } /** - * Convert the data type of a strings column to the desired type given by a data schema. + * Convert from a strings column to a column with the desired type given by a data schema. * * @param input The input strings column * @param schema The schema of the output column - * @param allowedNonNumericNumbers Whether non-numeric numbers are allowed - * @param isUSLocale Whether the current local is US locale + * @param allowedNonNumericNumbers Whether non-numeric numbers are allowed, used when converting + * strings to float types + * @param isUSLocale Whether the current locale is US locale, used when converting strings to + * decimal types * @return A column with the desired data type */ - public static ColumnVector convertDataType(ColumnView input, Schema schema, - boolean allowedNonNumericNumbers, - boolean isUSLocale) { + public static ColumnVector convertFromStrings(ColumnView input, Schema schema, + boolean allowedNonNumericNumbers, + boolean isUSLocale) { assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type"; - return new ColumnVector(convertDataType(input.getNativeView(), + return new ColumnVector(convertFromStrings(input.getNativeView(), schema.getFlattenedNumChildren(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), @@ -242,7 +255,6 @@ 
private static native long[] getJsonObjectMultiplePaths(long input, long memoryBudgetBytes, int parallelOverride); - private static native long extractRawMapFromJsonString(long input, boolean normalizeSingleQuotes, boolean leadingZerosAllowed, @@ -261,13 +273,13 @@ private static native long fromJSONToStructs(long input, boolean unquotedControlChars, boolean isUSLocale); - private static native long convertDataType(long input, - int[] numChildren, - int[] typeIds, - int[] typeScales, - int[] typePrecision, - boolean nonNumericNumbersAllowed, - boolean isUSLocale); + private static native long convertFromStrings(long input, + int[] numChildren, + int[] typeIds, + int[] typeScales, + int[] typePrecision, + boolean nonNumericNumbersAllowed, + boolean isUSLocale); private static native long removeQuotes(long input, boolean nullifyIfNotQuoted); }