Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds JSON-token-stream to JSON-tree #11291

Closed
wants to merge 58 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
0557d41
squashed with bracket/brace test
elstehle Apr 11, 2022
355d1e4
clean up & addressing review comments
elstehle Apr 20, 2022
39a6b65
refactored lookup tables
elstehle Apr 25, 2022
239f138
put lookup tables into their own cudf file
elstehle Apr 25, 2022
39cff80
Change interface for FST to not need temp storage
elstehle Apr 27, 2022
e24a133
removing unused var post-cleanup
elstehle May 4, 2022
caf6195
unified usage of pragma unrolls
elstehle May 9, 2022
ea79a81
Adding hostdevice macros to in-reg array
elstehle May 9, 2022
17dcbfd
making const vars const
elstehle May 9, 2022
6fdd24a
refactor lut sanity check
elstehle May 9, 2022
eccf970
fixes sg-count & uses rmm stream in fst tests
elstehle Jun 2, 2022
9fe8e4b
minor doxygen fix
elstehle Jun 14, 2022
694a365
adopts suggested fst test changes
elstehle Jun 15, 2022
f656f49
adopts device-side test data gen
elstehle Jul 7, 2022
485a1c6
adopts c++17 namespaces declarations
elstehle Jul 9, 2022
5f1c4b5
removes state vector-wrapper in favor of vanilla array
elstehle Jul 11, 2022
e6f8def
some west-const remainders & unifies StateIndexT
elstehle Jul 11, 2022
a798852
adds check for state transition narrowing conversion
elstehle Jul 11, 2022
eb24962
fixes logical stack test includes
elstehle Jul 12, 2022
f52e614
replaces enum with typed constexpr
elstehle Jul 14, 2022
3038058
adds excplitis error checking
elstehle Jul 14, 2022
d351e5c
addresses style review comments & fixes a todo
elstehle Jul 14, 2022
3f47952
replaces gtest asserts with expects
elstehle Jul 14, 2022
cba1619
fixes style in dispatch dfa
elstehle Jul 14, 2022
bea2a02
replaces vanilla loop with iota
elstehle Jul 15, 2022
8a184e9
rephrases documentation on in-reg array
elstehle Jul 16, 2022
78dd893
Merge remote-tracking branch 'upstream/branch-22.08' into feature/fin…
elstehle Jul 16, 2022
9b20d16
Added utility to debug print & instrumented code to use it
elstehle Mar 31, 2022
b260610
switched to using rmm also inside algorithm
elstehle Mar 31, 2022
49fa996
renaming key-value store op to stack_op
elstehle Apr 4, 2022
24dab9e
device_span
elstehle Apr 4, 2022
36c8296
minor style changes addressing review comments
elstehle Apr 13, 2022
fe06f0b
squashed with bracket/brace test
elstehle Apr 11, 2022
9dfd4ad
refactored lookup tables
elstehle Apr 25, 2022
6548836
put lookup tables into their own cudf file
elstehle Apr 25, 2022
6d3eff2
fixes sg-count & uses rmm stream in fst tests
elstehle Jun 2, 2022
7fc8619
rebase on latest FST
elstehle May 3, 2022
237456d
fixes breaking changes from dependent-FST-PR
elstehle Jun 2, 2022
4aaf595
fixes for breaking downstream interface changes
elstehle Jul 13, 2022
01aef44
wraps if with stream params into detail ns
elstehle Jul 13, 2022
67f609d
renames enums & moving from device_span to ptr params
elstehle Jul 14, 2022
2f7b254
Added utility to debug print & instrumented code to use it
elstehle Mar 31, 2022
62ddf66
switched to using rmm also inside algorithm
elstehle Mar 31, 2022
d18238f
renaming key-value store op to stack_op
elstehle Apr 4, 2022
f4ec994
device_span
elstehle Apr 4, 2022
671ce41
minor style changes addressing review comments
elstehle Apr 13, 2022
f996ce9
squashed with bracket/brace test
elstehle Apr 11, 2022
a8ac5fa
refactored lookup tables
elstehle Apr 25, 2022
00a95eb
put lookup tables into their own cudf file
elstehle Apr 25, 2022
987699f
fixes sg-count & uses rmm stream in fst tests
elstehle Jun 2, 2022
ff90528
squash & rebase on latest tokenizer version
elstehle May 13, 2022
bef4fb1
moved debug print to detail ns
elstehle May 17, 2022
3e756bb
replaces tree return type from tuple to struct
elstehle Jul 18, 2022
12cf0be
fix clang-format style fix
karthikeyann Jul 26, 2022
2b59b04
Merge branch 'branch-22.08' of https://github.com/rapidsai/cudf into …
karthikeyann Jul 26, 2022
3b2acb2
Merge branch 'branch-22.10' of https://github.com/rapidsai/cudf into …
karthikeyann Aug 11, 2022
8e75645
remove duplicate renamed header
karthikeyann Aug 11, 2022
6e1bc75
remove debug print in logical stack
karthikeyann Aug 11, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions cpp/src/io/json/nested_json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,49 @@ enum token_t : PdaTokenT {
NUM_TOKENS
};

/// Underlying storage type for the class (or "category") of a node in the tree representation
using NodeT = char;

/**
 * @brief Enumerates the classes (or "categories") that a node of the tree representation can
 * belong to. Enumerators are numbered consecutively starting at zero.
 */
enum node_t : NodeT {
  NC_STRUCT        = 0,  ///< Node representing a struct
  NC_LIST          = 1,  ///< Node representing a list
  NC_FN            = 2,  ///< Node representing a field name
  NC_STR           = 3,  ///< Node representing a string value
  NC_VAL           = 4,  ///< Node representing a numeric or literal value (e.g., true, false, null)
  NC_ERR           = 5,  ///< Node representing a parser error
  NUM_NODE_CLASSES = 6   ///< Total number of node classes (not a node class itself)
};

/// Type used to index into the nodes within the tree of structs, lists, field names, and value
/// nodes. The largest representable value is used as `parent_node_sentinel` and therefore does
/// not denote an actual node.
using NodeIndexT = uint32_t;

/// Type large enough to represent tree depth from [0, max-tree-depth); may be an unsigned type.
/// Aliases the stack level type (StackLevelT).
using TreeDepthT = StackLevelT;

/**
* @brief Struct that encapsulates all information of a columnar tree representation.
*
* As populated by detail::get_tree_representation, every member receives exactly one entry per
* emitted node, so entry `i` of each member describes the same node `i`. The member order is
* relied upon by brace-initialization of this struct.
*/
struct tree_meta_t {
/// Category of each node (values of node_t)
std::vector<NodeT> node_categories;
/// Id of each node's parent node, or parent_node_sentinel if the node has no parent
std::vector<NodeIndexT> parent_node_ids;
/// Level of each node, i.e., the number of open parents at the time the node was emitted
std::vector<TreeDepthT> node_levels;
/// Offset into the JSON input at which each node's range begins; for strings and field names
/// this is the offset just past the token's begin offset (skipping the quote character)
std::vector<SymbolOffsetT> node_range_begin;
/// Offset into the JSON input of the end token paired with the node, or `begin + 1` if no end
/// token was consumed for the node
/// NOTE(review): presumably an exclusive end offset -- confirm against consumers
std::vector<SymbolOffsetT> node_range_end;
};

/// Parent node id assigned to nodes that have no parent, i.e., nodes that are emitted while no
/// enclosing struct, list, or field name is open. Equals the largest value of NodeIndexT.
constexpr NodeIndexT parent_node_sentinel = std::numeric_limits<NodeIndexT>::max();

namespace detail {
/**
* @brief Identifies the stack context for each character from a JSON input. Specifically, we
Expand Down Expand Up @@ -110,6 +153,15 @@ void get_token_stream(device_span<SymbolT const> d_json_in,
SymbolOffsetT* d_tokens_indices,
SymbolOffsetT* d_num_written_tokens,
rmm::cuda_stream_view stream);

/**
* @brief Parses the given JSON string and generates a tree representation of the given input.
*
* The input is copied to the device, tokenized via get_token_stream, and the resulting tokens are
* copied back and converted into the tree on the host. The given stream is synchronized before
* the tokens are processed on the host.
*
* @note A CUDF_EXPECTS check fails with "Invalid JSON input." if an end-of-struct or end-of-list
* token is encountered while no parent node is open.
*
* @param input The JSON input
* @param stream The CUDA stream to which kernels are dispatched
* @return A tree_meta_t that holds, for every node of the tree, the node's category, the id of
* its parent node, its level, and the begin and end offsets of its range within the input
*/
tree_meta_t get_tree_representation(host_span<SymbolT const> input, rmm::cuda_stream_view stream);
} // namespace detail

} // namespace cudf::io::json
175 changes: 174 additions & 1 deletion cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@

#include <rmm/exec_policy.hpp>

#include <stack>

namespace cudf::io::json {

// JSON to stack operator DFA (Deterministic Finite Automata)
Expand Down Expand Up @@ -796,6 +798,177 @@ void get_token_stream(device_span<SymbolT const> json_in,
stream);
}

} // namespace detail
/**
 * @brief Parses the given JSON string and generates a tree representation of the given input.
 *
 * The input is copied to the device and tokenized via get_token_stream. The tokens are then
 * copied back to the host, where a single pass over the token stream emits one node per
 * begin-of-{struct, list, string, value, field name, error} token while maintaining a stack of
 * the currently open parent nodes.
 *
 * @param input The JSON input
 * @param stream The CUDA stream to which kernels are dispatched; it is synchronized before the
 * tokens are processed on the host
 * @return A tree_meta_t that holds, for every node of the tree, the node's category, the id of
 * its parent node, its level, and the begin and end offsets of its range within the input
 */
tree_meta_t get_tree_representation(host_span<SymbolT const> input, rmm::cuda_stream_view stream)
{
  constexpr std::size_t single_item = 1;
  hostdevice_vector<PdaTokenT> tokens_gpu{input.size(), stream};
  hostdevice_vector<SymbolOffsetT> token_indices_gpu{input.size(), stream};
  hostdevice_vector<SymbolOffsetT> num_tokens_out{single_item, stream};

  // Stage the JSON input on the device; surface copy failures instead of silently ignoring them
  rmm::device_uvector<SymbolT> d_input{input.size(), stream};
  CUDF_CUDA_TRY(cudaMemcpyAsync(d_input.data(),
                                input.data(),
                                input.size() * sizeof(SymbolT),
                                cudaMemcpyHostToDevice,
                                stream));

  // Parse the JSON and get the token stream
  cudf::io::json::detail::get_token_stream(
    cudf::device_span<SymbolT>{d_input.data(), d_input.size()},
    tokens_gpu.device_ptr(),
    token_indices_gpu.device_ptr(),
    num_tokens_out.device_ptr(),
    stream);

  // Copy the JSON tokens to the host
  token_indices_gpu.device_to_host(stream);
  tokens_gpu.device_to_host(stream);
  num_tokens_out.device_to_host(stream);

  // Make sure tokens have been copied to the host
  stream.synchronize();

  // Whether a token does represent a node in the tree representation
  auto is_node = [](PdaTokenT const token) {
    switch (token) {
      case token_t::StructBegin:
      case token_t::ListBegin:
      case token_t::StringBegin:
      case token_t::ValueBegin:
      case token_t::FieldNameBegin:
      case token_t::ErrorBegin: return true;
      default: return false;
    }
  };

  // The node that a token represents
  auto token_to_node = [](PdaTokenT const token) {
    switch (token) {
      case token_t::StructBegin: return NC_STRUCT;
      case token_t::ListBegin: return NC_LIST;
      case token_t::StringBegin: return NC_STR;
      case token_t::ValueBegin: return NC_VAL;
      case token_t::FieldNameBegin: return NC_FN;
      default: return NC_ERR;
    }
  };

  // The offset at which a token's range begins: strings and field names skip the quote character
  auto get_token_index = [](PdaTokenT const token, SymbolOffsetT const token_index) {
    constexpr SymbolOffsetT skip_quote_char = 1;
    switch (token) {
      case token_t::StringBegin: return token_index + skip_quote_char;
      case token_t::FieldNameBegin: return token_index + skip_quote_char;
      default: return token_index;
    }
  };

  // Whether a token expects to be followed by its respective end-of-* token partner
  auto is_begin_of_section = [](PdaTokenT const token) {
    switch (token) {
      case token_t::StringBegin:
      case token_t::ValueBegin:
      case token_t::FieldNameBegin: return true;
      default: return false;
    }
  };

  // The end-of-* partner token for a given beginning-of-* token
  auto end_of_partner = [](PdaTokenT const token) {
    switch (token) {
      case token_t::StringBegin: return token_t::StringEnd;
      case token_t::ValueBegin: return token_t::ValueEnd;
      case token_t::FieldNameBegin: return token_t::FieldNameEnd;
      default: return token_t::ErrorBegin;
    }
  };

  // Whether the token pops from the parent node stack
  auto does_pop = [](PdaTokenT const token) {
    switch (token) {
      case token_t::StructEnd:
      case token_t::ListEnd: return true;
      default: return false;
    }
  };

  // Whether the token pushes onto the parent node stack
  auto does_push = [](PdaTokenT const token) {
    switch (token) {
      case token_t::StructBegin:
      case token_t::ListBegin: return true;
      default: return false;
    }
  };

  // The node id sitting on top of the stack becomes the node's parent
  // The full stack represents the path from the root to the current node
  // The flag tells whether the stack entry is a field name node
  std::stack<std::pair<NodeIndexT, bool>> parent_stack;

  constexpr bool field_name_node    = true;
  constexpr bool no_field_name_node = false;

  std::vector<NodeT> node_categories;
  std::vector<NodeIndexT> parent_node_ids;
  std::vector<TreeDepthT> node_levels;
  std::vector<SymbolOffsetT> node_range_begin;
  std::vector<SymbolOffsetT> node_range_end;

  // Number of tokens that the tokenizer has written
  std::size_t const num_tokens = num_tokens_out[0];

  NodeIndexT node_id = 0;
  for (std::size_t i = 0; i < num_tokens; i++) {
    auto const token = tokens_gpu[i];

    // The section from the original JSON input that this token demarcates
    SymbolOffsetT const range_begin = get_token_index(token, token_indices_gpu[i]);
    SymbolOffsetT range_end         = range_begin + 1;

    // Identify this node's parent node id
    NodeIndexT const parent_node_id =
      (parent_stack.size() > 0) ? parent_stack.top().first : parent_node_sentinel;

    // If this token is the beginning-of-{value, string, field name}, also consume the next end-of-*
    // token, but only if that next token really is the end-of-* partner of *this* token. Any other
    // token (e.g., an error token) must be processed by its own loop iteration.
    if (is_begin_of_section(token)) {
      if ((i + 1) < num_tokens && tokens_gpu[i + 1] == end_of_partner(token)) {
        // Update the range_end for this pair of tokens
        range_end = token_indices_gpu[i + 1];
        // We can skip the subsequent end-of-* token
        i++;
      }
    }

    // Emit node if this token becomes a node in the tree
    if (is_node(token)) {
      node_categories.push_back(token_to_node(token));
      parent_node_ids.push_back(parent_node_id);
      node_levels.push_back(static_cast<TreeDepthT>(parent_stack.size()));
      node_range_begin.push_back(range_begin);
      node_range_end.push_back(range_end);
    }

    // Modify the stack if needed
    if (token == token_t::FieldNameBegin) {
      parent_stack.push({node_id, field_name_node});
    } else {
      if (does_push(token)) {
        parent_stack.push({node_id, no_field_name_node});
      } else if (does_pop(token)) {
        CUDF_EXPECTS(parent_stack.size() >= 1, "Invalid JSON input.");
        parent_stack.pop();
      }

      // If what we're left with is a field name on top of stack, we need to pop it
      if (parent_stack.size() >= 1 && parent_stack.top().second == field_name_node) {
        parent_stack.pop();
      }
    }

    // Update node_id
    if (is_node(token)) { node_id++; }
  }

  return {std::move(node_categories),
          std::move(parent_node_ids),
          std::move(node_levels),
          std::move(node_range_begin),
          std::move(node_range_end)};
}

} // namespace detail
} // namespace cudf::io::json
Loading