Skip to content

Commit

Permalink
replaces tree return type from tuple to struct
Browse files Browse the repository at this point in the history
  • Loading branch information
elstehle committed Jul 18, 2022
1 parent bef4fb1 commit 3e756bb
Show file tree
Hide file tree
Showing 3 changed files with 215 additions and 39 deletions.
28 changes: 22 additions & 6 deletions cpp/src/io/json/nested_json.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,16 @@ using NodeIndexT = uint32_t;
/// Type large enough to represent tree depth from [0, max-tree-depth); may be an unsigned type
using TreeDepthT = StackLevelT;

/// Tuple-based tree representation. Elements, in order: node categories, parent node ids,
/// node levels, node range begin offsets, node range end offsets.
using tree_meta_t = std::tuple<std::vector<NodeT>,
std::vector<NodeIndexT>,
std::vector<TreeDepthT>,
std::vector<SymbolOffsetT>,
std::vector<SymbolOffsetT>>;
/**
* @brief Struct that encapsulates all information of a columnar tree representation.
*
* Each member vector holds one entry per node; entry `i` of every vector describes node `i`.
*/
struct tree_meta_t {
/// Category (class) of each node, e.g., struct, list, string, value, field name, or error
std::vector<NodeT> node_categories;
/// Node id of each node's parent; `parent_node_sentinel` for nodes without a parent
std::vector<NodeIndexT> parent_node_ids;
/// Level of each node, i.e., the number of entries on the parent stack when the node was emitted
std::vector<TreeDepthT> node_levels;
/// Offset into the JSON input at which each node's range begins (for strings and field names,
/// the opening quote is skipped)
std::vector<SymbolOffsetT> node_range_begin;
/// Offset into the JSON input at which each node's range ends (one past the begin offset for
/// nodes that have no end-of-* partner token)
std::vector<SymbolOffsetT> node_range_end;
};

/// Parent node id assigned to nodes that have no parent (i.e., nodes emitted while the parent
/// stack is empty); walking up the parent chain terminates when this value is reached
constexpr NodeIndexT parent_node_sentinel = std::numeric_limits<NodeIndexT>::max();

Expand Down Expand Up @@ -94,7 +99,6 @@ enum token_t : PdaTokenT {
NUM_TOKENS
};

namespace detail {
/**
* @brief Class of a node (or a node "category") within the tree representation
*/
Expand All @@ -115,6 +119,8 @@ enum node_t : NodeT {
NUM_NODE_CLASSES
};

namespace detail {

/**
* @brief Identifies the stack context for each character from a JSON input. Specifically, we
* identify brackets and braces outside of quoted fields (e.g., field names, strings).
Expand Down Expand Up @@ -145,6 +151,16 @@ void get_token_stream(device_span<SymbolT const> d_json_in,
SymbolOffsetT* d_tokens_indices,
SymbolOffsetT* d_num_written_tokens,
rmm::cuda_stream_view stream);

/**
* @brief Parses the given JSON string and generates a tree representation of the given input.
*
* @param input The JSON input
* @param stream The CUDA stream to which kernels are dispatched
* @return A `tree_meta_t` holding, for every node of the tree, its category, its parent's node id,
* its level, and the begin and end offsets of the node's range within the input
*/
tree_meta_t get_tree_representation(host_span<SymbolT const> input, rmm::cuda_stream_view stream);

} // namespace detail

} // namespace cudf::io::json::gpu
175 changes: 174 additions & 1 deletion cpp/src/io/json/nested_json_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@

#include <rmm/exec_policy.hpp>

#include <stack>

namespace cudf::io::json::gpu {

//------------------------------------------------------------------------------
Expand Down Expand Up @@ -618,6 +620,177 @@ void get_token_stream(device_span<SymbolT const> d_json_in,
stream);
}

} // namespace detail
/**
 * @brief Parses the given JSON string and generates a tree representation of the given input.
 *
 * The input is copied to the device and tokenized via `get_token_stream`. The tokens are then
 * copied back to the host, where a single pass over the token stream builds the tree using a stack
 * of parent nodes.
 *
 * Fails via `CUDF_EXPECTS` ("Invalid JSON input.") if a struct/list end token is encountered while
 * the parent stack is empty.
 *
 * @param input The JSON input
 * @param stream The CUDA stream to which kernels are dispatched
 * @return A `tree_meta_t` holding, for every node of the tree, its category, its parent's node id
 * (`parent_node_sentinel` if there is none), its level, and the begin and end offsets of the
 * node's range within the input
 */
tree_meta_t get_tree_representation(host_span<SymbolT const> input, rmm::cuda_stream_view stream)
{
  constexpr std::size_t single_item = 1;

  // Token buffers are allocated with one slot per input symbol
  hostdevice_vector<PdaTokenT> tokens_gpu{input.size(), stream};
  hostdevice_vector<SymbolOffsetT> token_indices_gpu{input.size(), stream};
  hostdevice_vector<SymbolOffsetT> num_tokens_out{single_item, stream};

  // Copy the JSON input to the device
  // NOTE(review): the return code of cudaMemcpyAsync is not checked here -- consider wrapping the
  // call in the project's CUDA error-checking macro
  rmm::device_uvector<SymbolT> d_input{input.size(), stream};
  cudaMemcpyAsync(
    d_input.data(), input.data(), input.size() * sizeof(input[0]), cudaMemcpyHostToDevice, stream);

  // Parse the JSON and get the token stream
  cudf::io::json::gpu::detail::get_token_stream(
    cudf::device_span<SymbolT>{d_input.data(), d_input.size()},
    tokens_gpu.device_ptr(),
    token_indices_gpu.device_ptr(),
    num_tokens_out.device_ptr(),
    stream);

  // Copy the JSON tokens to the host
  token_indices_gpu.device_to_host(stream);
  tokens_gpu.device_to_host(stream);
  num_tokens_out.device_to_host(stream);

  // Make sure tokens have been copied to the host
  stream.synchronize();

  // Whether a token represents a node in the tree representation
  auto is_node = [](PdaTokenT const token) {
    switch (token) {
      case token_t::StructBegin:
      case token_t::ListBegin:
      case token_t::StringBegin:
      case token_t::ValueBegin:
      case token_t::FieldNameBegin:
      case token_t::ErrorBegin: return true;
      default: return false;
    }
  };

  // The node category that a token maps to
  auto token_to_node = [](PdaTokenT const token) {
    switch (token) {
      case token_t::StructBegin: return NC_STRUCT;
      case token_t::ListBegin: return NC_LIST;
      case token_t::StringBegin: return NC_STR;
      case token_t::ValueBegin: return NC_VAL;
      case token_t::FieldNameBegin: return NC_FN;
      default: return NC_ERR;
    }
  };

  // The begin offset of a node's range: for strings and field names the offset is advanced by one
  // to skip the opening quote character
  auto get_token_index = [](PdaTokenT const token, SymbolOffsetT const token_index) {
    constexpr SymbolOffsetT skip_quote_char = 1;
    switch (token) {
      case token_t::StringBegin: return token_index + skip_quote_char;
      case token_t::FieldNameBegin: return token_index + skip_quote_char;
      default: return token_index;
    }
  };

  // Whether a token expects to be followed by its respective end-of-* token partner
  auto is_begin_of_section = [](PdaTokenT const token) {
    switch (token) {
      case token_t::StringBegin:
      case token_t::ValueBegin:
      case token_t::FieldNameBegin: return true;
      default: return false;
    }
  };

  // The end-of-* partner token for a given beginning-of-* token
  auto end_of_partner = [](PdaTokenT const token) {
    switch (token) {
      case token_t::StringBegin: return token_t::StringEnd;
      case token_t::ValueBegin: return token_t::ValueEnd;
      case token_t::FieldNameBegin: return token_t::FieldNameEnd;
      default: return token_t::ErrorBegin;
    }
  };

  // Whether the token pops from the parent node stack
  auto does_pop = [](PdaTokenT const token) {
    switch (token) {
      case token_t::StructEnd:
      case token_t::ListEnd: return true;
      default: return false;
    }
  };

  // Whether the token pushes onto the parent node stack
  auto does_push = [](PdaTokenT const token) {
    switch (token) {
      case token_t::StructBegin:
      case token_t::ListBegin: return true;
      default: return false;
    }
  };

  // The node id sitting on top of the stack becomes the node's parent
  // The full stack represents the path from the root to the current node
  // The flag of each entry tells whether that entry is a field name node
  std::stack<std::pair<NodeIndexT, bool>> parent_stack;

  constexpr bool field_name_node    = true;
  constexpr bool no_field_name_node = false;

  std::vector<NodeT> node_categories;
  std::vector<NodeIndexT> parent_node_ids;
  std::vector<TreeDepthT> node_levels;
  std::vector<SymbolOffsetT> node_range_begin;
  std::vector<SymbolOffsetT> node_range_end;

  // The number of tokens is fixed once the stream has been synchronized
  auto const num_tokens = num_tokens_out[0];

  std::size_t node_id = 0;
  for (std::size_t i = 0; i < num_tokens; i++) {
    auto const token = tokens_gpu[i];

    // The section from the original JSON input that this token demarcates
    std::size_t range_begin = get_token_index(token, token_indices_gpu[i]);
    std::size_t range_end   = range_begin + 1;

    // Identify this node's parent node id
    std::size_t parent_node_id =
      (parent_stack.size() > 0) ? parent_stack.top().first : parent_node_sentinel;

    // If this token is the beginning-of-{value, string, field name}, also consume the next end-of-*
    // token, provided that the next token actually is this token's matching partner
    if (is_begin_of_section(token)) {
      if ((i + 1) < num_tokens && end_of_partner(token) == tokens_gpu[i + 1]) {
        // Update the range_end for this pair of tokens
        range_end = token_indices_gpu[i + 1];
        // We can skip the subsequent end-of-* token
        i++;
      }
    }

    // Emit node if this token becomes a node in the tree
    if (is_node(token)) {
      node_categories.push_back(token_to_node(token));
      parent_node_ids.push_back(parent_node_id);
      node_levels.push_back(parent_stack.size());
      node_range_begin.push_back(range_begin);
      node_range_end.push_back(range_end);
    }

    // Modify the stack if needed
    if (token == token_t::FieldNameBegin) {
      // A field name becomes the parent of the node that follows it
      parent_stack.push({node_id, field_name_node});
    } else {
      if (does_push(token)) {
        parent_stack.push({node_id, no_field_name_node});
      } else if (does_pop(token)) {
        CUDF_EXPECTS(parent_stack.size() >= 1, "Invalid JSON input.");
        parent_stack.pop();
      }

      // If what we're left with is a field name on top of stack, we need to pop it
      if (parent_stack.size() >= 1 && parent_stack.top().second == field_name_node) {
        parent_stack.pop();
      }
    }

    // Update node_id
    if (is_node(token)) { node_id++; }
  }

  return {std::move(node_categories),
          std::move(parent_node_ids),
          std::move(node_levels),
          std::move(node_range_begin),
          std::move(node_range_end)};
}

} // namespace detail
} // namespace cudf::io::json::gpu
51 changes: 19 additions & 32 deletions cpp/tests/io/nested_json_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,6 @@ std::string get_node_string(std::size_t const node_id,
nested_json::tree_meta_t const& tree_rep,
std::string const& json_input)
{
auto const& node_categories = std::get<0>(tree_rep);
auto const& parent_node_ids = std::get<1>(tree_rep);
auto const& node_levels = std::get<2>(tree_rep);
auto const& node_range_begin = std::get<3>(tree_rep);
auto const& node_range_end = std::get<4>(tree_rep);

auto node_to_str = [] __host__ __device__(nested_json::PdaTokenT const token) {
switch (token) {
case nested_json::NC_STRUCT: return "STRUCT";
Expand All @@ -49,25 +43,24 @@ std::string get_node_string(std::size_t const node_id,
};
};

return "<" + std::to_string(node_id) + ":" + node_to_str(node_categories[node_id]) + ":[" +
std::to_string(node_range_begin[node_id]) + ", " +
std::to_string(node_range_end[node_id]) + ") '" +
json_input.substr(node_range_begin[node_id],
node_range_end[node_id] - node_range_begin[node_id]) +
return "<" + std::to_string(node_id) + ":" + node_to_str(tree_rep.node_categories[node_id]) +
":[" + std::to_string(tree_rep.node_range_begin[node_id]) + ", " +
std::to_string(tree_rep.node_range_end[node_id]) + ") '" +
json_input.substr(tree_rep.node_range_begin[node_id],
tree_rep.node_range_end[node_id] - tree_rep.node_range_begin[node_id]) +
"'>";
}

void print_tree_representation(std::string const& json_input,
nested_json::tree_meta_t const& tree_rep)
{
for (std::size_t i = 0; i < std::get<0>(tree_rep).size(); i++) {
auto const& parent_node_ids = std::get<1>(tree_rep);
std::size_t parent_id = parent_node_ids[i];
for (std::size_t i = 0; i < tree_rep.node_categories.size(); i++) {
std::size_t parent_id = tree_rep.parent_node_ids[i];
std::stack<std::size_t> path;
path.push(i);
while (parent_id != nested_json::parent_node_sentinel) {
path.push(parent_id);
parent_id = parent_node_ids[parent_id];
parent_id = tree_rep.parent_node_ids[parent_id];
}

while (path.size()) {
Expand Down Expand Up @@ -271,18 +264,12 @@ TEST_F(JsonTest, TreeRepresentation)
R"(}] )";

// Get the JSON's tree representation
auto tree_rep = nested_json::get_tree_representation(
auto tree_rep = nested_json::detail::get_tree_representation(
cudf::host_span<SymbolT const>{input.data(), input.size()}, stream_view);

// Print tree representation
if (std::getenv("CUDA_DBG_DUMP") != nullptr) { print_tree_representation(input, tree_rep); }

auto const& node_categories = std::get<0>(tree_rep);
auto const& parent_node_ids = std::get<1>(tree_rep);
auto const& node_levels = std::get<2>(tree_rep);
auto const& node_range_begin = std::get<3>(tree_rep);
auto const& node_range_end = std::get<4>(tree_rep);

// Golden sample of node categories
std::vector<nested_json::node_t> golden_node_categories = {
nested_json::NC_LIST, nested_json::NC_STRUCT, nested_json::NC_FN, nested_json::NC_STR,
Expand Down Expand Up @@ -347,17 +334,17 @@ TEST_F(JsonTest, TreeRepresentation)
147, 155, 159, 160, 162, 168, 170, 172, 175, 176, 181, 195, 209, 217, 252, 260, 267};

// Check results against golden samples
ASSERT_EQ(golden_node_categories.size(), node_categories.size());
ASSERT_EQ(golden_parent_node_ids.size(), parent_node_ids.size());
ASSERT_EQ(golden_node_levels.size(), node_levels.size());
ASSERT_EQ(golden_node_range_begin.size(), node_range_begin.size());
ASSERT_EQ(golden_node_range_end.size(), node_range_end.size());
ASSERT_EQ(golden_node_categories.size(), tree_rep.node_categories.size());
ASSERT_EQ(golden_parent_node_ids.size(), tree_rep.parent_node_ids.size());
ASSERT_EQ(golden_node_levels.size(), tree_rep.node_levels.size());
ASSERT_EQ(golden_node_range_begin.size(), tree_rep.node_range_begin.size());
ASSERT_EQ(golden_node_range_end.size(), tree_rep.node_range_end.size());

for (std::size_t i = 0; i < golden_node_categories.size(); i++) {
ASSERT_EQ(golden_node_categories[i], node_categories[i]);
ASSERT_EQ(golden_parent_node_ids[i], parent_node_ids[i]);
ASSERT_EQ(golden_node_levels[i], node_levels[i]);
ASSERT_EQ(golden_node_range_begin[i], node_range_begin[i]);
ASSERT_EQ(golden_node_range_end[i], node_range_end[i]);
ASSERT_EQ(golden_node_categories[i], tree_rep.node_categories[i]);
ASSERT_EQ(golden_parent_node_ids[i], tree_rep.parent_node_ids[i]);
ASSERT_EQ(golden_node_levels[i], tree_rep.node_levels[i]);
ASSERT_EQ(golden_node_range_begin[i], tree_rep.node_range_begin[i]);
ASSERT_EQ(golden_node_range_end[i], tree_rep.node_range_end[i]);
}
}

0 comments on commit 3e756bb

Please sign in to comment.