diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 940d03cdb41..2e2ac43d6fe 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -57,11 +57,13 @@ void write_json(data_sink* sink, /** * @brief Normalize single quotes to double quotes using FST * - * @param indata Input device buffer - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation + * @param indata Input device buffer + * @param delimiter Line-separating delimiter character in JSONL inputs + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation */ void normalize_single_quotes(datasource::owning_buffer& indata, + char delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 34a87918e57..1b61be20202 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -58,7 +58,7 @@ enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " SINGLE_QUOTE_CHAR, ///< Quote character SG: ' ESCAPE_CHAR, ///< Escape character SG: '\' - NEWLINE_CHAR, ///< Newline character SG: '\n' + DELIM_CHAR, ///< Delimiter character SG OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; @@ -72,13 +72,17 @@ constexpr auto TT_SEC = dfa_states::TT_SEC; constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const qna_sgs{ - {{'\"'}, {'\''}, {'\\'}, {'\n'}}}; +auto get_sgid_lut(SymbolT delim) +{ + // The i-th 
string representing all the characters of a symbol group + std::array, NUM_SYMBOL_GROUPS - 1> symbol_groups{ + {{'\"'}, {'\''}, {'\\'}, {delim}}}; + return symbol_groups; +} // Transition table std::array, TT_NUM_STATES> const qna_state_tt{{ - /* IN_STATE " ' \ \n OTHER */ + /* IN_STATE " ' \ DELIM OTHER */ /* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}}, /* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}}, /* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}}, @@ -199,28 +203,26 @@ struct TransduceToNormalizedQuotes { namespace normalize_whitespace { +// We do not need a symbol group for the delimiter character since whitespace normalization +// now occurs after tokenization. enum class dfa_symbol_group_id : uint32_t { DOUBLE_QUOTE_CHAR, ///< Quote character SG: " ESCAPE_CHAR, ///< Escape character SG: '\\' - NEWLINE_CHAR, ///< Newline character SG: '\n' WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' ' OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; // Alias for readability of symbol group ids constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); -// The i-th string representing all the characters of a symbol group -std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{ - {{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}}; + +std::array, NUM_SYMBOL_GROUPS - 1> const wna_sgs{{{'"'}, {'\\'}, {' ', '\t'}}}; /** * -------- FST states --------- * ----------------------------- * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double - * | quotes as well as any other character not enclosed by a string. Also handles - * | newline character present within a string - * TT_DQS | Double-quoted string state handling all characters within double quotes except - * | newline character + * | quotes as well as any other character not enclosed by a string. 
+ * TT_DQS | Double-quoted string state handling all characters within double quotes * TT_DEC | State handling escaped characters inside double-quoted string. Note that this * | state is necessary to process escaped double-quote characters. Without this * | state, whitespaces following escaped double quotes inside strings may be removed. @@ -235,10 +237,10 @@ constexpr auto TT_NUM_STATES = static_cast(dfa_states::TT_NUM_STATES); // Transition table std::array, TT_NUM_STATES> const wna_state_tt{ - {/* IN_STATE " \ \n OTHER */ - /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}}, - /* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}}, - /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; + {/* IN_STATE " \ WS OTHER */ + /* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_DQS */ {{TT_OOS, TT_DEC, TT_DQS, TT_DQS}}, + /* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}}; // The DFA's starting state constexpr StateT start_state = static_cast(TT_OOS); @@ -302,18 +304,19 @@ struct TransduceToNormalizedWS { namespace detail { void normalize_single_quotes(datasource::owning_buffer& indata, + char delimiter, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t max_out = 2; - auto parser = - fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs), - fst::detail::make_transition_table(normalize_quotes::qna_state_tt), - fst::detail::make_translation_functor( - normalize_quotes::TransduceToNormalizedQuotes{}), - stream); + auto parser = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(normalize_quotes::get_sgid_lut(delimiter)), + fst::detail::make_transition_table(normalize_quotes::qna_state_tt), + fst::detail::make_translation_functor( + normalize_quotes::TransduceToNormalizedQuotes{}), + stream); rmm::device_buffer outbuf(indata.size() * 2, stream, mr); cudf::detail::device_scalar outbuf_size(stream, mr); diff --git 
a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 2bc15ea19cb..279f5e71351 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -248,7 +248,8 @@ table_with_metadata read_batch(host_span> sources, // If input JSON buffer has single quotes and option to normalize single quotes is enabled, // invoke pre-processing FST if (reader_opts.is_enabled_normalize_single_quotes()) { - normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref()); + normalize_single_quotes( + bufview, reader_opts.get_delimiter(), stream, cudf::get_current_device_resource_ref()); } auto buffer = diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp index c8c2d18903f..0fbd7da7f4d 100644 --- a/cpp/tests/io/json/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json/json_quote_normalization_test.cpp @@ -34,7 +34,9 @@ // Base test fixture for tests struct JsonNormalizationTest : public cudf::test::BaseFixture {}; -void run_test(std::string const& host_input, std::string const& expected_host_output) +void run_test(std::string const& host_input, + std::string const& expected_host_output, + char delimiter = '\n') { // RMM memory resource std::shared_ptr rsc = @@ -46,7 +48,7 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou // Preprocessing FST cudf::io::datasource::owning_buffer device_data(std::move(device_input)); - cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get()); + cudf::io::json::detail::normalize_single_quotes(device_data, delimiter, stream_view, rsc.get()); std::string preprocessed_host_output(device_data.size(), 0); CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(), @@ -172,6 +174,13 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces run_test(input, output); } +TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_NonNewlineDelimiter) +{ + 
std::string input{"{\"a\": \"1\n2\"}z{\'a\': 12}"}; + std::string output{"{\"a\": \"1\n2\"}z{\"a\": 12}"}; + run_test(input, output, 'z'); +} + TEST_F(JsonNormalizationTest, ReadJsonOption) { // RMM memory resource @@ -179,22 +188,24 @@ TEST_F(JsonNormalizationTest, ReadJsonOption) std::make_shared(); // Test input - std::string const host_input = R"({"A":'TEST"'})"; + std::string const host_input = R"({"a": "1\n2"}h{'a': 12})"; cudf::io::json_reader_options input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{host_input.data(), host_input.size()}) .lines(true) + .delimiter('h') .normalize_single_quotes(true); cudf::io::table_with_metadata processed_table = cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get()); // Expected table - std::string const expected_input = R"({"A":"TEST\""})"; + std::string const expected_input = R"({"a": "1\n2"}h{"a": 12})"; cudf::io::json_reader_options expected_input_options = cudf::io::json_reader_options::builder( cudf::io::source_info{expected_input.data(), expected_input.size()}) - .lines(true); + .lines(true) + .delimiter('h'); cudf::io::table_with_metadata expected_table = cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get());