Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose delimiter character in JSON reader options to JSON reader APIs #17266

Merged
merged 10 commits into from
Nov 9, 2024
8 changes: 5 additions & 3 deletions cpp/include/cudf/io/detail/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,13 @@ void write_json(data_sink* sink,
/**
* @brief Normalize single quotes to double quotes using FST
*
* @param indata Input device buffer
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
* @param indata Input device buffer
* @param delimiter Line-separating delimiter character in JSONL inputs
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
*/
void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
char delimiter,
shrshi marked this conversation as resolved.
Show resolved Hide resolved
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);

Expand Down
50 changes: 27 additions & 23 deletions cpp/src/io/json/json_normalization.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ enum class dfa_symbol_group_id : uint32_t {
DOUBLE_QUOTE_CHAR, ///< Quote character SG: "
SINGLE_QUOTE_CHAR, ///< Quote character SG: '
ESCAPE_CHAR, ///< Escape character SG: '\'
NEWLINE_CHAR, ///< Newline character SG: '\n'
DELIM_CHAR, ///< Delimiter character SG
OTHER_SYMBOLS, ///< SG implicitly matching all other characters
NUM_SYMBOL_GROUPS ///< Total number of symbol groups
};
Expand All @@ -72,13 +72,18 @@ constexpr auto TT_SEC = dfa_states::TT_SEC;
constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);

// The i-th string representing all the characters of a symbol group
std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const qna_sgs{
{{'\"'}, {'\''}, {'\\'}, {'\n'}}};
/**
 * @brief Builds the symbol groups of the quote-normalization FST for a given delimiter
 *
 * Each entry lists the characters of one symbol group, in the same order as the columns
 * of the transition table: double quote, single quote, backslash, delimiter. The trailing
 * "other symbols" group matches implicitly and therefore has no entry, which is why the
 * array holds `NUM_SYMBOL_GROUPS - 1` elements.
 *
 * @tparam SymbolT Character type of the symbols
 * @param delim Character placed in the delimiter symbol group
 * @return Array of per-group character lists
 */
template <typename SymbolT>
auto get_sgid_lut(SymbolT delim)
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
{
// symbol_groups[i] lists every character that belongs to the i-th symbol group
std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> symbol_groups{
{{'\"'}, {'\''}, {'\\'}, {delim}}};
return symbol_groups;
}

// Transition table
std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const qna_state_tt{{
/* IN_STATE " ' \ \n OTHER */
/* IN_STATE " ' \ <delim> OTHER */
/* TT_OOS */ {{TT_DQS, TT_SQS, TT_OOS, TT_OOS, TT_OOS}},
/* TT_DQS */ {{TT_OOS, TT_DQS, TT_DEC, TT_OOS, TT_DQS}},
/* TT_SQS */ {{TT_SQS, TT_OOS, TT_SEC, TT_OOS, TT_SQS}},
Expand Down Expand Up @@ -199,28 +204,26 @@ struct TransduceToNormalizedQuotes {

namespace normalize_whitespace {

// We do not need a symbol group for the delimiter character since whitespace normalization
// now occurs after tokenization.
enum class dfa_symbol_group_id : uint32_t {
DOUBLE_QUOTE_CHAR, ///< Quote character SG: "
ESCAPE_CHAR, ///< Escape character SG: '\\'
NEWLINE_CHAR, ///< Newline character SG: '\n'
WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' '
OTHER_SYMBOLS, ///< SG implicitly matching all other characters
NUM_SYMBOL_GROUPS ///< Total number of symbol groups
};
// Alias for readability of symbol group ids
constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
// The i-th string representing all the characters of a symbol group
std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
{{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};

std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{{{'"'}, {'\\'}, {' ', '\t'}}};

/**
* -------- FST states ---------
* -----------------------------
* TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
* | quotes as well as any other character not enclosed by a string. Also handles
* | newline character present within a string
* TT_DQS | Double-quoted string state handling all characters within double quotes except
* | newline character
* | quotes as well as any other character not enclosed by a string.
* TT_DQS | Double-quoted string state handling all characters within double quotes
* TT_DEC | State handling escaped characters inside double-quoted string. Note that this
* | state is necessary to process escaped double-quote characters. Without this
* | state, whitespaces following escaped double quotes inside strings may be removed.
Expand All @@ -235,10 +238,10 @@ constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);

// Transition table
std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
{/* IN_STATE " \ \n <SPC> OTHER */
/* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
/* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
/* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};
{/* IN_STATE " \ <SPC> OTHER */
/* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS}},
/* TT_DQS */ {{TT_OOS, TT_DEC, TT_DQS, TT_DQS}},
/* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};

// The DFA's starting state
constexpr StateT start_state = static_cast<StateT>(TT_OOS);
Expand Down Expand Up @@ -302,18 +305,19 @@ struct TransduceToNormalizedWS {
namespace detail {

void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
char delimiter,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
static constexpr std::int32_t min_out = 0;
static constexpr std::int32_t max_out = 2;
auto parser =
fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
normalize_quotes::TransduceToNormalizedQuotes{}),
stream);
auto parser = fst::detail::make_fst(
fst::detail::make_symbol_group_lut(normalize_quotes::get_sgid_lut(delimiter)),
fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
normalize_quotes::TransduceToNormalizedQuotes{}),
stream);

rmm::device_buffer outbuf(indata.size() * 2, stream, mr);
cudf::detail::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/io/json/read_json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -248,7 +248,8 @@ table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
// If input JSON buffer has single quotes and option to normalize single quotes is enabled,
// invoke pre-processing FST
if (reader_opts.is_enabled_normalize_single_quotes()) {
normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref());
normalize_single_quotes(
bufview, reader_opts.get_delimiter(), stream, cudf::get_current_device_resource_ref());
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
}

auto buffer =
Expand Down
21 changes: 16 additions & 5 deletions cpp/tests/io/json/json_quote_normalization_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@
// Base test fixture for tests
struct JsonNormalizationTest : public cudf::test::BaseFixture {};

void run_test(std::string const& host_input, std::string const& expected_host_output)
void run_test(std::string const& host_input,
std::string const& expected_host_output,
char delimiter = '\n')
{
// RMM memory resource
std::shared_ptr<rmm::mr::device_memory_resource> rsc =
Expand All @@ -46,7 +48,7 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou

// Preprocessing FST
cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(device_input));
cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get());
cudf::io::json::detail::normalize_single_quotes(device_data, delimiter, stream_view, rsc.get());

std::string preprocessed_host_output(device_data.size(), 0);
CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),
Expand Down Expand Up @@ -172,29 +174,38 @@ TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_Invalid_WrongBraces
run_test(input, output);
}

// Checks quote normalization when the delimiter is 'z' instead of '\n': the newline inside
// the double-quoted string of the first record is expected to pass through unchanged, while
// the single-quoted key in the record after the delimiter is rewritten with double quotes.
TEST_F(JsonNormalizationTest, GroundTruth_QuoteNormalization_NonNewlineDelimiter)
{
std::string input{"{\"a\": \"1\n2\"}z{\'a\': 12}"};
std::string output{"{\"a\": \"1\n2\"}z{\"a\": 12}"};
run_test(input, output, 'z');
}

TEST_F(JsonNormalizationTest, ReadJsonOption)
{
// RMM memory resource
std::shared_ptr<rmm::mr::device_memory_resource> rsc =
std::make_shared<rmm::mr::cuda_memory_resource>();

// Test input
std::string const host_input = R"({"A":'TEST"'})";
std::string const host_input = R"({"a": "1\n2"}h{'a': 12})";
cudf::io::json_reader_options input_options =
cudf::io::json_reader_options::builder(
cudf::io::source_info{host_input.data(), host_input.size()})
.lines(true)
.delimiter('h')
.normalize_single_quotes(true);

cudf::io::table_with_metadata processed_table =
cudf::io::read_json(input_options, cudf::test::get_default_stream(), rsc.get());

// Expected table
std::string const expected_input = R"({"A":"TEST\""})";
std::string const expected_input = R"({"a": "1\n2"}h{"a": 12})";
cudf::io::json_reader_options expected_input_options =
cudf::io::json_reader_options::builder(
cudf::io::source_info{expected_input.data(), expected_input.size()})
.lines(true);
.lines(true)
.delimiter('h');

cudf::io::table_with_metadata expected_table =
cudf::io::read_json(expected_input_options, cudf::test::get_default_stream(), rsc.get());
shrshi marked this conversation as resolved.
Show resolved Hide resolved
Expand Down
Loading