Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API for JSON unquoted whitespace normalization #15033

Merged
merged 31 commits into from
Mar 4, 2024
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
ee6e82e
normalize_whitespace api
shrshi Feb 12, 2024
9ed0a78
easting some consts
shrshi Feb 13, 2024
b1e755e
java bindings
shrshi Feb 13, 2024
15e6f4a
Merge branch 'branch-24.04' into json-whitespace
shrshi Feb 13, 2024
19e0428
style fix
shrshi Feb 13, 2024
878d50d
more style fix
shrshi Feb 13, 2024
68083fc
fixing context for quoted newline comment
shrshi Feb 13, 2024
bf9936d
Merge branch 'branch-24.04' into json-whitespace
shrshi Feb 14, 2024
b05e9c0
example fix
shrshi Feb 14, 2024
0f22b90
Merge branch 'json-whitespace' of github.com:shrshi/cudf into json-wh…
shrshi Feb 14, 2024
ec6a2b0
Merge branch 'branch-24.04' into json-whitespace
shrshi Feb 15, 2024
08f4c29
Merge branch 'branch-24.04' into json-whitespace
shrshi Feb 15, 2024
688d389
tests fix - default values for mr and stream; better input for ReadJs…
shrshi Feb 15, 2024
1c7111e
Merge branch 'json-whitespace' of github.com:shrshi/cudf into json-wh…
shrshi Feb 15, 2024
e6f23bf
Merge branch 'branch-24.04' into json-whitespace
shrshi Feb 15, 2024
86eb776
Merge branch 'branch-24.04' into json-whitespace
shrshi Feb 16, 2024
6db60ee
Merge branch 'branch-24.04' into json-whitespace
shrshi Feb 23, 2024
6d50364
PR reviews
shrshi Feb 23, 2024
2158940
Merge branch 'json-whitespace' of github.com:shrshi/cudf into json-wh…
shrshi Feb 23, 2024
3b91d32
Merge branch 'branch-24.04' into json-whitespace
vuule Feb 23, 2024
cbcb6e5
pr reviews
shrshi Feb 27, 2024
2b03dde
Merge branch 'json-whitespace' of github.com:shrshi/cudf into json-wh…
shrshi Feb 27, 2024
52c3711
Merge branch 'branch-24.04' into json-whitespace
shrshi Feb 27, 2024
057ba7c
Merge branch 'branch-24.04' into json-whitespace
shrshi Feb 27, 2024
3d185fe
Merge branch 'branch-24.04' into json-whitespace
shrshi Feb 28, 2024
f362211
Merge branch 'branch-24.04' into json-whitespace
shrshi Feb 29, 2024
07e05d0
Merge branch 'branch-24.04' into json-whitespace
shrshi Feb 29, 2024
8385c3c
Merge branch 'branch-24.04' into json-whitespace
shrshi Mar 1, 2024
98c103d
Merge branch 'branch-24.04' into json-whitespace
shrshi Mar 1, 2024
d4b1b26
Merge branch 'branch-24.04' into json-whitespace
shrshi Mar 1, 2024
ff91891
Merge branch 'branch-24.04' into json-whitespace
shrshi Mar 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ add_library(
src/io/functions.cpp
src/io/json/byte_range_info.cu
src/io/json/json_column.cu
src/io/json/json_quote_normalization.cu
src/io/json/json_normalization.cu
src/io/json/json_tree.cu
src/io/json/nested_json_gpu.cu
src/io/json/read_json.cu
Expand Down
10 changes: 10 additions & 0 deletions cpp/include/cudf/io/detail/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,14 @@ rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& in
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
 * @brief Normalize unquoted whitespace (space and tab characters) using FST
 *
 * Space and tab characters that appear outside of double-quoted strings are dropped from the
 * buffer; every other character is passed through unchanged.
 *
 * @param inbuf Input device buffer
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource to use for device memory allocation
 * @return Device buffer holding the normalized JSON; its size is at most the size of `inbuf`
 */
rmm::device_uvector<char> normalize_whitespace(rmm::device_uvector<char>&& inbuf,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);
} // namespace cudf::io::json::detail
31 changes: 31 additions & 0 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ class json_reader_options {
// Normalize single quotes
bool _normalize_single_quotes = false;

// Whether spaces and tabs outside of quoted strings are normalized before the input is parsed
// (disabled by default)
bool _normalize_whitespace = false;

// Whether to recover after an invalid JSON line
json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;

Expand Down Expand Up @@ -265,6 +268,13 @@ class json_reader_options {
*/
bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; }

/**
 * @brief Reports whether normalization of unquoted whitespace characters is turned on
 *
 * @returns true if the reader should normalize whitespace, false otherwise
 */
bool is_enabled_normalize_whitespace() const
{
  return _normalize_whitespace;
}

/**
* @brief Queries the JSON reader's behavior on invalid JSON lines.
*
Expand Down Expand Up @@ -358,6 +368,14 @@ class json_reader_options {
*/
void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; }

/**
 * @brief Turns normalization of unquoted whitespace on or off
 *
 * @param val true if the reader should normalize unquoted whitespace characters, i.e. tabs and
 * spaces; false otherwise
 */
void enable_normalize_whitespace(bool val)
{
  _normalize_whitespace = val;
}

/**
* @brief Specifies the JSON reader's behavior on invalid JSON lines.
*
Expand Down Expand Up @@ -533,6 +551,19 @@ class json_reader_options_builder {
return *this;
}

/**
 * @brief Chooses whether the reader being built normalizes unquoted whitespace
 *
 * @param val true to normalize unquoted whitespace, false to leave the input as is
 * @return this for chaining
 */
json_reader_options_builder& normalize_whitespace(bool val)
{
  // Route through the public setter rather than touching the private field directly
  options.enable_normalize_whitespace(val);
  return *this;
}

/**
* @brief Specifies the JSON reader's behavior on invalid JSON lines.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,15 @@

namespace cudf::io::json {

using SymbolT = char;
using StateT = char;
// Type used to represent the atomic symbol type used within the finite-state machine
using SymbolT = char;
// Type used to represent the finite-state machine's state ids; it is the underlying type of the
// `dfa_states` enums declared in this file
using StateT = char;

// Type sufficiently large to index symbols within the input and output (may be unsigned)
using SymbolOffsetT = uint32_t;

namespace normalize_quotes {

// Type sufficiently large to index symbols within the input and output (may be unsigned)
enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
enum class dfa_symbol_group_id : uint32_t {
DOUBLE_QUOTE_CHAR, ///< Quote character SG: "
Expand Down Expand Up @@ -172,6 +174,116 @@ struct TransduceToNormalizedQuotes {

} // namespace normalize_quotes

namespace normalize_whitespace {

// Symbol groups (SG) the whitespace-normalization FST distinguishes between. The enumerator
// order defines the column order of the transition table.
enum class dfa_symbol_group_id : uint32_t {
DOUBLE_QUOTE_CHAR, ///< Quote character SG: "
ESCAPE_CHAR, ///< Escape character SG: '\\'
NEWLINE_CHAR, ///< Newline character SG: '\n'
WHITESPACE_SYMBOLS, ///< Whitespace characters SG: '\t' or ' '
OTHER_SYMBOLS, ///< SG implicitly matching all other characters
NUM_SYMBOL_GROUPS ///< Total number of symbol groups
};
// Alias for readability of symbol group ids
constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
// The i-th string representing all the characters of a symbol group.
// There is one entry fewer than NUM_SYMBOL_GROUPS because OTHER_SYMBOLS is matched implicitly and
// therefore has no explicit character list.
std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> const wna_sgs{
{{'"'}, {'\\'}, {'\n'}, {' ', '\t'}}};

/**
 * -------- FST states ---------
 * -----------------------------
 * TT_OOS | Out-of-string state handling whitespace and non-whitespace chars outside double
 *        | quotes as well as any other character not enclosed by a string. Also handles
 *        | newline character present within a string
 * TT_DQS | Double-quoted string state handling all characters within double quotes except
 *        | newline character
 * TT_DEC | State handling escaped characters inside double-quoted string. Note that this
 *        | state is necessary to process escaped double-quote characters. Without this
 *        | state, whitespaces following escaped double quotes inside strings may be removed.
 *
 * NOTE: An important case NOT handled by this FST is that of whitespace following newline
 * characters within a string. Consider the following example, in which \n stands for a raw
 * newline byte (not the two-character escape sequence, which passes through TT_DEC and leaves
 * the FST in TT_DQS):
 * Input:           {"a":"x\n y"}
 * FST output:      {"a":"x\ny"}
 * Expected output: {"a":"x\n y"}
 * The space is lost because a raw newline seen in TT_DQS moves the FST to TT_OOS, where
 * whitespace is dropped.
 * Such strings are not part of the JSON standard (characters allowed within quotes should
 * have ASCII at least 0x20 i.e. space character and above) but may be encountered while
 * reading JSON files
 */
enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_DEC, TT_NUM_STATES };
// Aliases for readability of the transition table
constexpr auto TT_OOS = dfa_states::TT_OOS;
constexpr auto TT_DQS = dfa_states::TT_DQS;
constexpr auto TT_DEC = dfa_states::TT_DEC;
constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);

// Transition table: one row per current state, one column per symbol group (in the order of
// dfa_symbol_group_id); each entry is the state entered after reading a symbol of that group
std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const wna_state_tt{
{/* IN_STATE      "       \       \n    <SPC/TAB> OTHER */
/* TT_OOS */ {{TT_DQS, TT_OOS, TT_OOS, TT_OOS, TT_OOS}},
/* TT_DQS */ {{TT_OOS, TT_DEC, TT_OOS, TT_DQS, TT_DQS}},
/* TT_DEC */ {{TT_DQS, TT_DQS, TT_DQS, TT_DQS, TT_DQS}}}};

// The DFA's starting state
constexpr StateT start_state = static_cast<StateT>(TT_OOS);

/**
 * @brief Translation functor of the whitespace-normalization FST.
 *
 * Provides two overloads of the call operator: one giving the number of output symbols for a
 * transition and one giving the output symbol itself.
 */
struct TransduceToNormalizedWS {
  /**
   * @brief Returns the <relative_offset>-th output symbol on the transition (state_id, match_id).
   *
   * Translation table, with Sigma denoting the alphabet:
   *
   *   state | read symbol       -> output symbol
   *   ------+------------------------------------
   *   DQS   | Sigma             -> same symbol
   *   DEC   | Sigma             -> same symbol
   *   OOS   | Sigma\{<SPC>,\t}  -> same symbol
   *   OOS   | <SPC> or \t       -> <nop>
   *
   * The <nop> rows need no handling here: the three-argument overload reports zero output
   * symbols for them, so this overload is never called for an unquoted space or tab.
   */
  template <typename StateT, typename SymbolGroupT, typename RelativeOffsetT, typename SymbolT>
  constexpr CUDF_HOST_DEVICE SymbolT operator()(StateT const state_id,
                                                SymbolGroupT const match_id,
                                                RelativeOffsetT const relative_offset,
                                                SymbolT const read_symbol) const
  {
    // Every transition that produces output simply echoes the symbol that was read
    return read_symbol;
  }

  /**
   * @brief Returns the number of output characters for a given transition.
   *
   * Whitespace normalization emits exactly one character (the input character) per transition,
   * except for a space/tab read outside of a string, which is removed and so emits none.
   */
  template <typename StateT, typename SymbolGroupT, typename SymbolT>
  constexpr CUDF_HOST_DEVICE uint32_t operator()(StateT const state_id,
                                                 SymbolGroupT const match_id,
                                                 SymbolT const read_symbol) const
  {
    auto const is_whitespace =
      match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::WHITESPACE_SYMBOLS);
    auto const is_unquoted = state_id == static_cast<StateT>(dfa_states::TT_OOS);

    // An unquoted space or tab is dropped; anything else is passed through
    return (is_whitespace && is_unquoted) ? 0 : 1;
  }
};

} // namespace normalize_whitespace

namespace detail {

rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT>&& inbuf,
Expand All @@ -198,5 +310,29 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
return outbuf;
}

/**
 * @brief Normalizes unquoted whitespace in a JSON buffer by running the whitespace FST over it
 *
 * Space and tab characters read while the FST is in its out-of-string state produce no output;
 * every other character is copied to the output unchanged.
 *
 * @param inbuf Input device buffer holding the JSON text
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned buffer
 * @return Device buffer holding the normalized text, resized to the number of symbols written
 */
rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&& inbuf,
                                                  rmm::cuda_stream_view stream,
                                                  rmm::mr::device_memory_resource* mr)
{
  auto parser = fst::detail::make_fst(
    fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
    fst::detail::make_transition_table(normalize_whitespace::wna_state_tt),
    fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}),
    stream);

  // Each input symbol yields at most one output symbol, so the input size is a safe upper bound
  rmm::device_uvector<SymbolT> outbuf(inbuf.size(), stream, mr);
  // The output symbol count is a temporary that never leaves this function, so it is allocated
  // from the default resource; `mr` is reserved for memory returned to the caller
  rmm::device_scalar<SymbolOffsetT> outbuf_size(stream);
  parser.Transduce(inbuf.data(),
                   static_cast<SymbolOffsetT>(inbuf.size()),
                   outbuf.data(),
                   thrust::make_discard_iterator(),  // second output of the FST is not needed here
                   outbuf_size.data(),
                   normalize_whitespace::start_state,
                   stream);

  // Shrink the buffer to the number of symbols the FST reported writing
  outbuf.resize(outbuf_size.value(stream), stream);
  return outbuf;
}

} // namespace detail
} // namespace cudf::io::json
7 changes: 7 additions & 0 deletions cpp/src/io/json/read_json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,13 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource());
}

// If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
// enabled, invoke pre-processing FST
if (reader_opts.is_enabled_normalize_whitespace()) {
buffer =
normalize_whitespace(std::move(buffer), stream, rmm::mr::get_current_device_resource());
}

return device_parse_nested_json(buffer, reader_opts, stream, mr);
// For debug purposes, use host_parse_nested_json()
}
Expand Down
Loading
Loading