From d158ccdbe651952bd649cb0f17c41467c5209824 Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 4 Mar 2024 15:25:51 -0500 Subject: [PATCH] API for JSON unquoted whitespace normalization (#15033) This work is a follow-up to PR #14931 which provided a proof-of-concept for using an FST to normalize unquoted whitespaces. This PR implements the pre-processing FST in cuIO and adds a JSON reader option that needs to be set to true to invoke the normalizer. Addresses feature request #14865 Authors: - Shruti Shivakumar (https://github.com/shrshi) - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Vukasin Milovanovic (https://github.com/vuule) - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/15033 --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/io/detail/json.hpp | 10 + cpp/include/cudf/io/json.hpp | 31 +++ ...normalization.cu => json_normalization.cu} | 142 ++++++++++++- cpp/src/io/json/read_json.cu | 7 + .../io/json_whitespace_normalization_test.cu | 201 ++++-------------- .../main/java/ai/rapids/cudf/JSONOptions.java | 15 ++ java/src/main/java/ai/rapids/cudf/Table.java | 9 + java/src/main/native/src/TableJni.cpp | 27 ++- .../test/java/ai/rapids/cudf/TableTest.java | 49 +++-- java/src/test/resources/whitespaces.json | 5 + 11 files changed, 314 insertions(+), 184 deletions(-) rename cpp/src/io/json/{json_quote_normalization.cu => json_normalization.cu} (57%) create mode 100644 java/src/test/resources/whitespaces.json diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5fd6cd3544a..c74963be50d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -376,7 +376,7 @@ add_library( src/io/functions.cpp src/io/json/byte_range_info.cu src/io/json/json_column.cu - src/io/json/json_quote_normalization.cu + src/io/json/json_normalization.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu 
src/io/json/read_json.cu diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 0eb0e17ea10..3f7f7e9bb32 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -63,4 +63,14 @@ rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& in rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Normalize unquoted whitespace (space and tab characters) using FST + * + * @param inbuf Input device buffer + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ +rmm::device_uvector normalize_whitespace(rmm::device_uvector&& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace cudf::io::json::detail diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index f0c3d48ab7e..593dd044d51 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -118,6 +118,9 @@ class json_reader_options { // Normalize single quotes bool _normalize_single_quotes = false; + // Normalize unquoted spaces and tabs + bool _normalize_whitespace = false; + // Whether to recover after an invalid JSON line json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; @@ -265,6 +268,13 @@ class json_reader_options { */ bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; } + /** + * @brief Whether the reader should normalize unquoted whitespace characters + * + * @returns true if the reader should normalize whitespace, false otherwise + */ + bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; } + /** * @brief Queries the JSON reader's behavior on invalid JSON lines. 
* @@ -358,6 +368,14 @@ class json_reader_options { */ void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; } + /** + * @brief Set whether the reader should enable normalization of unquoted whitespace + * + * @param val Boolean value to indicate whether the reader should normalize unquoted whitespace + * characters i.e. tabs and spaces + */ + void enable_normalize_whitespace(bool val) { _normalize_whitespace = val; } + /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. * @@ -533,6 +551,19 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether the reader should normalize unquoted whitespace + * + * @param val Boolean value to indicate whether the reader should normalize unquoted + * whitespace + * @return this for chaining + */ + json_reader_options_builder& normalize_whitespace(bool val) + { + options._normalize_whitespace = val; + return *this; + } + /** * @brief Specifies the JSON reader's behavior on invalid JSON lines. 
* diff --git a/cpp/src/io/json/json_quote_normalization.cu b/cpp/src/io/json/json_normalization.cu similarity index 57% rename from cpp/src/io/json/json_quote_normalization.cu rename to cpp/src/io/json/json_normalization.cu index a13b6e0b016..86e4da664a8 100644 --- a/cpp/src/io/json/json_quote_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -32,13 +32,15 @@ namespace cudf::io::json { -using SymbolT = char; -using StateT = char; +// Type used to represent the atomic symbol type used within the finite-state machine +using SymbolT = char; +using StateT = char; + +// Type sufficiently large to index symbols within the input and output (may be unsigned) using SymbolOffsetT = uint32_t; namespace normalize_quotes { -// Type sufficiently large to index symbols within the input and output (may be unsigned) enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES }; enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " @@ -172,6 +174,116 @@ struct TransduceToNormalizedQuotes { } // namespace normalize_quotes +namespace normalize_whitespace { + +enum class dfa_symbol_group_id : uint32_t { + DOUBLE_QUOTE_CHAR, ///< Quote character SG: " + ESCAPE_CHAR, ///< Escape character SG: '\\' + NEWLINE_CHAR, ///< Newline character SG: '\n' + WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' ' + OTHER_SYMBOLS, ///< SG implicitly matching all other characters + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; +// Alias for readability of symbol group ids +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); +// The i-th string representing all the characters of a symbol group +std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ + {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}}; + +/** + * -------- FST states --------- + * ----------------------------- + * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double + * | quotes as well 
as any other character not enclosed by a string. Also handles + * | newline character present within a string + * TT_DQS | Double-quoted string state handling all characters within double quotes except + * | newline character + * TT_DEC | State handling escaped characters inside double-quoted string. Note that this + * | state is necessary to process escaped double-quote characters. Without this + * | state, whitespaces following escaped double quotes inside strings may be removed. + * + * NOTE: An important case NOT handled by this FST is that of whitespace following newline + * characters within a string. Consider the following example + * Input: {"a":"x\n y"} + * FST output: {"a":"x\ny"} + * Expected output: {"a":"x\n y"} + * Such strings are not part of the JSON standard (characters allowed within quotes should + * have ASCII at least 0x20 i.e. space character and above) but may be encountered while + * reading JSON files + */ +enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES }; +// Aliases for readability of the transition table +constexpr auto TT_OOS = dfa_states::TT_OOS; +constexpr auto TT_DQS = dfa_states::TT_DQS; +constexpr auto TT_DEC = dfa_states::TT_DEC; +constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); + +// Transition table +std::array, TT_NUM_STATES> const wna_state_tt{ + {/* IN_STATE " \ \n OTHER */ + /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}}, + /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; + +// The DFA's starting state +constexpr StateT start_state = static_cast(TT_OOS); + +struct TransduceToNormalizedWS { + /** + * @brief Returns the -th output symbol on the transition (state_id, match_id). 
+ */ + template + constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, + SymbolGroupT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + // -------- TRANSLATION TABLE ------------ + // Let the alphabet set be Sigma + // --------------------------------------- + // ---------- NON-SPECIAL CASES: ---------- + // Output symbol same as input symbol + // state | read_symbol -> output_symbol + // DQS | Sigma -> Sigma + // OOS | Sigma\{,\t} -> Sigma\{,\t} + // DEC | Sigma -> Sigma + // ---------- SPECIAL CASES: -------------- + // Input symbol translates to output symbol + // OOS | {} -> + // OOS | {\t} -> + + // Case when read symbol is a space or tab but is unquoted + // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function + // However, since there is no output in this case i.e. the count returned by + // operator()(state_id, match_id, read_symbol) is zero, this function is never called. + // So skipping the check for this case. + + // In all other cases, we have an output symbol for the input symbol. + // We simply output the input symbol + return read_symbol; + } + + /** + * @brief Returns the number of output characters for a given transition. 
+ * During whitespace normalization, we always emit one output character i.e., the input + * character, except when we need to remove the space/tab character + */ + template + constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id, + SymbolGroupT const match_id, + SymbolT const read_symbol) const + { + // Case when read symbol is a space or tab but is unquoted + if (match_id == static_cast(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && + state_id == static_cast(dfa_states::TT_OOS)) { + return 0; + } + return 1; + } +}; + +} // namespace normalize_whitespace + namespace detail { rmm::device_uvector normalize_single_quotes(rmm::device_uvector&& inbuf, @@ -198,5 +310,29 @@ rmm::device_uvector normalize_single_quotes(rmm::device_uvector normalize_whitespace(rmm::device_uvector&& inbuf, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto parser = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), + fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), + fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}), + stream); + + rmm::device_uvector outbuf(inbuf.size(), stream, mr); + rmm::device_scalar outbuf_size(stream, mr); + parser.Transduce(inbuf.data(), + static_cast(inbuf.size()), + outbuf.data(), + thrust::make_discard_iterator(), + outbuf_size.data(), + normalize_whitespace::start_state, + stream); + + outbuf.resize(outbuf_size.value(stream), stream); + return outbuf; +} + } // namespace detail } // namespace cudf::io::json diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index ba8acf2d47a..506d7b6cddc 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -235,6 +235,13 @@ table_with_metadata read_json(host_span> sources, normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource()); } + // If input JSON buffer has unquoted spaces and tabs and option to 
normalize whitespaces is + // enabled, invoke pre-processing FST + if (reader_opts.is_enabled_normalize_whitespace()) { + buffer = + normalize_whitespace(std::move(buffer), stream, rmm::mr::get_current_device_resource()); + } + return device_parse_nested_json(buffer, reader_opts, stream, mr); // For debug purposes, use host_parse_nested_json() } diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json_whitespace_normalization_test.cu index 545d8d2c4f9..336d360063f 100644 --- a/cpp/tests/io/json_whitespace_normalization_test.cu +++ b/cpp/tests/io/json_whitespace_normalization_test.cu @@ -13,177 +13,41 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "io/fst/lookup_tables.cuh" -#include "io/utilities/hostdevice_vector.hpp" - #include #include -#include +#include #include -#include +#include +#include +#include #include +#include -#include #include +#include -#include - -#include #include -namespace { -// Type used to represent the atomic symbol type used within the finite-state machine -using SymbolT = char; -using StateT = char; - -// Type sufficiently large to index symbols within the input and output (may be unsigned) -using SymbolOffsetT = uint32_t; - -enum class dfa_symbol_group_id : uint32_t { - DOUBLE_QUOTE_CHAR, ///< Quote character SG: " - ESCAPE_CHAR, ///< Escape character SG: '\\' - NEWLINE_CHAR, ///< Newline character SG: '\n' - WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' ' - OTHER_SYMBOLS, ///< SG implicitly matching all other characters - NUM_SYMBOL_GROUPS ///< Total number of symbol groups -}; -// Alias for readability of symbol group ids -constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ - {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}}; - -/** - * -------- FST states --------- - * 
----------------------------- - * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double - * | quotes as well as any other character not enclosed by a string. Also handles - * | newline character present within a string - * TT_DQS | Double-quoted string state handling all characters within double quotes except - * | newline character - * TT_DEC | State handling escaped characters inside double-quoted string. Note that this - * | state is necessary to process escaped double-quote characters. Without this - * | state, whitespaces following escaped double quotes inside strings may be removed. - * - * NOTE: An important case NOT handled by this FST is that of whitespace following newline - * characters within a string. Consider the following example - * Input: {"a":"x\n y"} - * FST output: {"a":"x\ny"} - * Expected output: {"a":"x\n y"} - */ -enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES }; -// Aliases for readability of the transition table -constexpr auto TT_OOS = dfa_states::TT_OOS; -constexpr auto TT_DQS = dfa_states::TT_DQS; -constexpr auto TT_DEC = dfa_states::TT_DEC; -constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); - -// Transition table -std::array, TT_NUM_STATES> const wna_state_tt{ - {/* IN_STATE " \ \n OTHER */ - /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}}, - /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}}, - /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; - -// The DFA's starting state -constexpr StateT start_state = static_cast(TT_OOS); - -struct TransduceToNormalizedWS { - /** - * @brief Returns the -th output symbol on the transition (state_id, match_id). 
- */ - template - constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id, - SymbolGroupT const match_id, - RelativeOffsetT const relative_offset, - SymbolT const read_symbol) const - { - // -------- TRANSLATION TABLE ------------ - // Let the alphabet set be Sigma - // --------------------------------------- - // ---------- NON-SPECIAL CASES: ---------- - // Output symbol same as input symbol - // state | read_symbol -> output_symbol - // DQS | Sigma -> Sigma - // OOS | Sigma\{,\t} -> Sigma\{,\t} - // DEC | Sigma -> Sigma - // ---------- SPECIAL CASES: -------------- - // Input symbol translates to output symbol - // OOS | {} -> - // OOS | {\t} -> - - // Case when read symbol is a space or tab but is unquoted - // This will be the same condition as in `operator()(state_id, match_id, read_symbol)` function - // However, since there is no output in this case i.e. the count returned by - // operator()(state_id, match_id, read_symbol) is zero, this function is never called. - // So skipping the check for this case. - - // In all other cases, we have an output symbol for the input symbol. - // We simply output the input symbol - return read_symbol; - } - - /** - * @brief Returns the number of output characters for a given transition. 
- * During whitespace normalization, we always emit one output character i.e., the input - * character, except when we need to remove the space/tab character - */ - template - constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id, - SymbolGroupT const match_id, - SymbolT const read_symbol) const - { - // Case when read symbol is a space or tab but is unquoted - if (match_id == static_cast(dfa_symbol_group_id::WHITESPACE_SYMBOLS) && - state_id == static_cast(dfa_states::TT_OOS)) { - return 0; - } - return 1; - } -}; -} // namespace - // Base test fixture for tests struct JsonWSNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& input, std::string const& output) +void run_test(std::string const& host_input, std::string const& expected_host_output) { - auto parser = cudf::io::fst::detail::make_fst( - cudf::io::fst::detail::make_symbol_group_lut(wna_sgs), - cudf::io::fst::detail::make_transition_table(wna_state_tt), - cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedWS{}), - cudf::test::get_default_stream()); - - auto d_input_scalar = cudf::make_string_scalar(input, cudf::test::get_default_stream()); - auto& d_input = static_cast&>(*d_input_scalar); + auto stream_view = cudf::get_default_stream(); + auto device_input = cudf::detail::make_device_uvector_async( + host_input, stream_view, rmm::mr::get_current_device_resource()); - // Prepare input & output buffers - constexpr std::size_t single_item = 1; - cudf::detail::hostdevice_vector output_gpu(input.size(), - cudf::test::get_default_stream()); - cudf::detail::hostdevice_vector output_gpu_size(single_item, - cudf::test::get_default_stream()); + // Preprocessing FST + auto device_fst_output = cudf::io::json::detail::normalize_whitespace( + std::move(device_input), stream_view, rmm::mr::get_current_device_resource()); - // Allocate device-side temporary storage & run algorithm - parser.Transduce(d_input.data(), - static_cast(d_input.size()), - 
output_gpu.device_ptr(), - thrust::make_discard_iterator(), - output_gpu_size.device_ptr(), - start_state, - cudf::test::get_default_stream()); + auto const preprocessed_host_output = + cudf::detail::make_std_vector_sync(device_fst_output, stream_view); - // Async copy results from device to host - output_gpu.device_to_host_async(cudf::test::get_default_stream()); - output_gpu_size.device_to_host_async(cudf::test::get_default_stream()); - - // Make sure results have been copied back to host - cudf::test::get_default_stream().synchronize(); - - // Verify results - ASSERT_EQ(output_gpu_size[0], output.size()); - CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output, output.size()); + ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size()); + CUDF_TEST_EXPECT_VECTOR_EQUAL( + preprocessed_host_output, expected_host_output, preprocessed_host_output.size()); } TEST_F(JsonWSNormalizationTest, GroundTruth_Spaces) @@ -259,4 +123,33 @@ TEST_F(JsonWSNormalizationTest, GroundTruth_InvalidInput) run_test(input, output); } +TEST_F(JsonWSNormalizationTest, ReadJsonOption) +{ + // When mixed type fields are read as strings, the table read will differ depending on the + // value of normalize_whitespace + + // Test input + std::string const host_input = "{ \"a\" : {\"b\" :\t\"c\"}}"; + cudf::io::json_reader_options input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{host_input.data(), host_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(true); + + cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options); + + // Expected table + std::string const expected_input = R"({ "a" : {"b":"c"}})"; + cudf::io::json_reader_options expected_input_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{expected_input.data(), expected_input.size()}) + .lines(true) + .mixed_types_as_string(true) + .normalize_whitespace(false); + + cudf::io::table_with_metadata expected_table = 
cudf::io::read_json(expected_input_options); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.tbl->view(), processed_table.tbl->view()); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 62496e32f7a..b37d0d88ec9 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -31,6 +31,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean lines; private final boolean recoverWithNull; private final boolean normalizeSingleQuotes; + private final boolean normalizeWhitespace; private final boolean mixedTypesAsStrings; private final boolean keepStringQuotes; @@ -40,6 +41,7 @@ private JSONOptions(Builder builder) { lines = builder.lines; recoverWithNull = builder.recoverWithNull; normalizeSingleQuotes = builder.normalizeSingleQuotes; + normalizeWhitespace = builder.normalizeWhitespace; mixedTypesAsStrings = builder.mixedTypesAsStrings; keepStringQuotes = builder.keepQuotes; } @@ -61,6 +63,10 @@ public boolean isNormalizeSingleQuotes() { return normalizeSingleQuotes; } + public boolean isNormalizeWhitespace() { + return normalizeWhitespace; + } + public boolean isMixedTypesAsStrings() { return mixedTypesAsStrings; } @@ -84,6 +90,7 @@ public static final class Builder extends ColumnFilterOptions.Builder(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) - .keep_quotes(keep_quotes) - .mixed_types_as_string(mixed_types_as_string); + .normalize_whitespace(static_cast(normalize_whitespace)) + .mixed_types_as_string(mixed_types_as_string) + .keep_quotes(keep_quotes); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1461,8 +1462,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( JNIEnv *env, jclass, jlong buffer, 
jlong buffer_length, jboolean day_first, jboolean lines, - jboolean recover_with_null, jboolean normalize_single_quotes, jboolean mixed_types_as_string, - jboolean keep_quotes) { + jboolean recover_with_null, jboolean normalize_single_quotes, jboolean normalize_whitespace, + jboolean mixed_types_as_string, jboolean keep_quotes) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1484,8 +1485,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) - .keep_quotes(keep_quotes) - .mixed_types_as_string(mixed_types_as_string); + .normalize_whitespace(static_cast(normalize_whitespace)) + .mixed_types_as_string(mixed_types_as_string) + .keep_quotes(keep_quotes); auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1573,8 +1575,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types, jintArray j_scales, jboolean day_first, jboolean lines, jboolean recover_with_null, - jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes, - jlong ds_handle) { + jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, + jboolean keep_quotes, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1606,6 +1608,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) + .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); @@ -1646,7 +1649,8 @@ JNIEXPORT jlong JNICALL 
Java_ai_rapids_cudf_Table_readJSON( JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types, jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines, jboolean recover_with_null, - jboolean normalize_single_quotes, jboolean mixed_types_as_string, jboolean keep_quotes) { + jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, + jboolean keep_quotes) { bool read_buffer = true; if (buffer == 0) { @@ -1693,6 +1697,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON( .lines(static_cast(lines)) .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) + .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index bee8d1cbb88..3f0470d854a 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -88,6 +88,7 @@ public class TableTest extends CudfTestBase { private static final File TEST_SIMPLE_JSON_FILE = TestUtils.getResourceAsFile("people.json"); private static final File TEST_JSON_ERROR_FILE = TestUtils.getResourceAsFile("people_with_invalid_lines.json"); private static final File TEST_JSON_SINGLE_QUOTES_FILE = TestUtils.getResourceAsFile("single_quotes.json"); + private static final File TEST_JSON_WHITESPACES_FILE = TestUtils.getResourceAsFile("whitespaces.json"); private static final File TEST_MIXED_TYPE_1_JSON = TestUtils.getResourceAsFile("mixed_types_1.json"); private static final File TEST_MIXED_TYPE_2_JSON = TestUtils.getResourceAsFile("mixed_types_2.json"); @@ -349,6 +350,39 @@ void testReadSingleQuotesJSONFile() throws IOException { } @Test + void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException { + Schema schema = 
Schema.builder() + .column(DType.STRING, "A") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withNormalizeSingleQuotes(false) + .build(); + try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) { + assertThrows(CudfException.class, () -> + Table.readJSON(schema, opts, source)); + } + } + + @Test + void testReadWhitespacesJSONFile() throws IOException { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withLines(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column("b", "50", "[1,2,3,4,5,6,7,8]", "{\"c\":\"d\"}", "b") + .build(); + MultiBufferDataSource source = sourceFrom(TEST_JSON_WHITESPACES_FILE); + Table table = Table.readJSON(schema, opts, source)) { + assertTablesAreEqual(expected, table); + } + } + void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { Schema schema = Schema.builder() .column(DType.STRING, "A") @@ -547,21 +581,6 @@ void testReadMixedType2JSONFile() throws IOException { } } - @Test - void testReadSingleQuotesJSONFileFeatureDisabled() throws IOException { - Schema schema = Schema.builder() - .column(DType.STRING, "A") - .build(); - JSONOptions opts = JSONOptions.builder() - .withLines(true) - .withNormalizeSingleQuotes(false) - .build(); - try (MultiBufferDataSource source = sourceFrom(TEST_JSON_SINGLE_QUOTES_FILE)) { - assertThrows(CudfException.class, () -> - Table.readJSON(schema, opts, source)); - } - } - @Test void testReadJSONFromDataSource() throws IOException { Schema schema = Schema.builder() diff --git a/java/src/test/resources/whitespaces.json b/java/src/test/resources/whitespaces.json new file mode 100644 index 00000000000..f5ddd8cde5f --- /dev/null +++ b/java/src/test/resources/whitespaces.json @@ -0,0 +1,5 @@ +{"a":"b"} + { "a" : "50" } +{"a": [1, 2, 3, 4, 5, 6, 7, 8]} +{"a": {"c": "d"}} +{"a": "b"}