diff --git a/cpp/src/io/json/nested_json.h b/cpp/src/io/json/nested_json.h
index 911c8a734c0..4d67a14da45 100644
--- a/cpp/src/io/json/nested_json.h
+++ b/cpp/src/io/json/nested_json.h
@@ -56,11 +56,16 @@ using NodeIndexT = uint32_t;
 /// Type large enough to represent tree depth from [0, max-tree-depth); may be an unsigned type
 using TreeDepthT = StackLevelT;
 
-using tree_meta_t = std::tuple<std::vector<NodeT>,
-                               std::vector<NodeIndexT>,
-                               std::vector<TreeDepthT>,
-                               std::vector<SymbolOffsetT>,
-                               std::vector<SymbolOffsetT>>;
+/**
+ * @brief Struct that encapsulates all information of a columnar tree representation.
+ */
+struct tree_meta_t {
+  std::vector<NodeT> node_categories;
+  std::vector<NodeIndexT> parent_node_ids;
+  std::vector<TreeDepthT> node_levels;
+  std::vector<SymbolOffsetT> node_range_begin;
+  std::vector<SymbolOffsetT> node_range_end;
+};
 
 constexpr NodeIndexT parent_node_sentinel = std::numeric_limits<NodeIndexT>::max();
 
@@ -94,7 +99,6 @@ enum token_t : PdaTokenT {
   NUM_TOKENS
 };
 
-namespace detail {
 /**
  * @brief Class of a node (or a node "category") within the tree representation
  */
@@ -115,6 +119,8 @@ enum node_t : NodeT {
   NUM_NODE_CLASSES
 };
 
+namespace detail {
+
 /**
  * @brief Identifies the stack context for each character from a JSON input. Specifically, we
  * identify brackets and braces outside of quoted fields (e.g., field names, strings).
@@ -145,6 +151,16 @@ void get_token_stream(device_span<SymbolT const> d_json_in,
                       SymbolOffsetT* d_tokens_indices,
                       SymbolOffsetT* d_num_written_tokens,
                       rmm::cuda_stream_view stream);
+
+/**
+ * @brief Parses the given JSON string and generates a tree representation of the given input.
+ *
+ * @param input The JSON input
+ * @param stream The CUDA stream to which kernels are dispatched
+ * @return The columnar tree representation of the given JSON input
+ */
+tree_meta_t get_tree_representation(host_span<SymbolT const> input, rmm::cuda_stream_view stream);
+
 }  // namespace detail
 
 }  // namespace cudf::io::json::gpu
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 07994ea2fa7..fbe78b08003 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -25,6 +25,8 @@
 
 #include <rmm/exec_policy.hpp>
 
+#include <stack>
+
 namespace cudf::io::json::gpu {
 
 //------------------------------------------------------------------------------
@@ -618,6 +620,181 @@ void get_token_stream(device_span<SymbolT const> d_json_in,
                     stream);
 }
 
-}  // namespace detail
+tree_meta_t get_tree_representation(host_span<SymbolT const> input, rmm::cuda_stream_view stream)
+{
+  constexpr std::size_t single_item = 1;
+  hostdevice_vector<PdaTokenT> tokens_gpu{input.size(), stream};
+  hostdevice_vector<SymbolOffsetT> token_indices_gpu{input.size(), stream};
+  hostdevice_vector<SymbolOffsetT> num_tokens_out{single_item, stream};
+
+  // Copy the JSON input to the device (NOTE(review): the returned cudaError_t is unchecked)
+  rmm::device_uvector<SymbolT> d_input{input.size(), stream};
+  cudaMemcpyAsync(
+    d_input.data(), input.data(), input.size() * sizeof(input[0]), cudaMemcpyHostToDevice, stream);
+
+  // Parse the JSON and get the token stream
+  cudf::io::json::gpu::detail::get_token_stream(
+    cudf::device_span<SymbolT>{d_input.data(), d_input.size()},
+    tokens_gpu.device_ptr(),
+    token_indices_gpu.device_ptr(),
+    num_tokens_out.device_ptr(),
+    stream);
+  // Copy the JSON tokens to the host
+  token_indices_gpu.device_to_host(stream);
+  tokens_gpu.device_to_host(stream);
+  num_tokens_out.device_to_host(stream);
+
+  // Make sure tokens have been copied to the host
+  stream.synchronize();
+
+  // Whether a token does represent a node in the tree representation
+  auto is_node = [](PdaTokenT const token) {
+    switch (token) {
+      case token_t::StructBegin:
+      case token_t::ListBegin:
+      case token_t::StringBegin:
+      case token_t::ValueBegin:
+      case token_t::FieldNameBegin:
+      case token_t::ErrorBegin: return true;
+      default: return false;
+    };
+  };
+
+  // The node that a token represents
+  auto token_to_node = [](PdaTokenT const token) {
+    switch (token) {
+      case token_t::StructBegin: return NC_STRUCT;
+      case token_t::ListBegin: return NC_LIST;
+      case token_t::StringBegin: return NC_STR;
+      case token_t::ValueBegin: return NC_VAL;
+      case token_t::FieldNameBegin: return NC_FN;
+      default: return NC_ERR;
+    };
+  };
+
+  // Begin offset of a token's node range; strings and field names skip one leading quote char
+  auto get_token_index = [](PdaTokenT const token, SymbolOffsetT const token_index) {
+    constexpr SymbolOffsetT skip_quote_char = 1;
+    switch (token) {
+      case token_t::StringBegin: return token_index + skip_quote_char;
+      case token_t::FieldNameBegin: return token_index + skip_quote_char;
+      default: return token_index;
+    };
+  };
+
+  // Whether a token expects to be followed by its respective end-of-* token partner
+  auto is_begin_of_section = [](PdaTokenT const token) {
+    switch (token) {
+      case token_t::StringBegin:
+      case token_t::ValueBegin:
+      case token_t::FieldNameBegin: return true;
+      default: return false;
+    };
+  };
+
+  // The end-of-* partner token for a given beginning-of-* token
+  auto end_of_partner = [](PdaTokenT const token) {
+    switch (token) {
+      case token_t::StringBegin: return token_t::StringEnd;
+      case token_t::ValueBegin: return token_t::ValueEnd;
+      case token_t::FieldNameBegin: return token_t::FieldNameEnd;
+      default: return token_t::ErrorBegin;
+    };
+  };
+
+  // Whether the token pops from the parent node stack
+  auto does_pop = [](PdaTokenT const token) {
+    switch (token) {
+      case token_t::StructEnd:
+      case token_t::ListEnd: return true;
+      default: return false;
+    };
+  };
+
+  // Whether the token pushes onto the parent node stack
+  auto does_push = [](PdaTokenT const token) {
+    switch (token) {
+      case token_t::StructBegin:
+      case token_t::ListBegin: return true;
+      default: return false;
+    };
+  };
+
+  // The node id sitting on top of the stack becomes the node's parent
+  // The full stack represents the path from the root to the current node
+  std::stack<std::pair<NodeIndexT, bool>> parent_stack;
+
+  // Tags stored next to a node id on parent_stack: whether that entry is a field name node
+  constexpr bool field_name_node    = true;
+  constexpr bool no_field_name_node = false;
+
+  // Output columns of the tree representation; each gets one entry per emitted node
+  std::vector<NodeT> node_categories;
+  std::vector<NodeIndexT> parent_node_ids;
+  std::vector<TreeDepthT> node_levels;
+  std::vector<SymbolOffsetT> node_range_begin;
+  std::vector<SymbolOffsetT> node_range_end;
+
+  std::size_t node_id = 0;
+  for (std::size_t i = 0; i < num_tokens_out[0]; i++) {
+    auto token = tokens_gpu[i];
+
+    // The section from the original JSON input that this token demarcates
+    std::size_t range_begin = get_token_index(token, token_indices_gpu[i]);
+    std::size_t range_end   = range_begin + 1;
+
+    // Identify this node's parent node id
+    std::size_t parent_node_id =
+      (parent_stack.size() > 0) ? parent_stack.top().first : parent_node_sentinel;
+
+    // If this token is the beginning-of-{value, string, field name}, also consume the next end-of-*
+    // token, but only if it really is the matching partner of this begin token
+    if (is_begin_of_section(token)) {
+      if ((i + 1) < num_tokens_out[0] && end_of_partner(token) == tokens_gpu[i + 1]) {
+        // Update the range_end for this pair of tokens
+        range_end = token_indices_gpu[i + 1];
+        // We can skip the subsequent end-of-* token
+        i++;
+      }
+    }
+
+    // Emit node if this token becomes a node in the tree
+    if (is_node(token)) {
+      node_categories.push_back(token_to_node(token));
+      parent_node_ids.push_back(parent_node_id);
+      node_levels.push_back(parent_stack.size());
+      node_range_begin.push_back(range_begin);
+      node_range_end.push_back(range_end);
+    }
+
+    // Modify the stack if needed
+    if (token == token_t::FieldNameBegin) {
+      parent_stack.push({node_id, field_name_node});
+    } else {
+      if (does_push(token)) {
+        parent_stack.push({node_id, no_field_name_node});
+      } else if (does_pop(token)) {
+        CUDF_EXPECTS(parent_stack.size() >= 1, "Invalid JSON input.");
+        parent_stack.pop();
+      }
+
+      // If what we're left with is a field name on top of stack, we need to pop it
+      if (parent_stack.size() >= 1 && parent_stack.top().second == field_name_node) {
+        parent_stack.pop();
+      }
+    }
+
+    // Update node_id
+    if (is_node(token)) { node_id++; }
+  }
+
+  return {std::move(node_categories),
+          std::move(parent_node_ids),
+          std::move(node_levels),
+          std::move(node_range_begin),
+          std::move(node_range_end)};
+}
+
+}  // namespace detail
 
 }  // namespace cudf::io::json::gpu
diff --git a/cpp/tests/io/nested_json_test.cu b/cpp/tests/io/nested_json_test.cu
index e00148597df..6116de31f14 100644
--- a/cpp/tests/io/nested_json_test.cu
+++ b/cpp/tests/io/nested_json_test.cu
@@ -31,12 +31,6 @@ std::string get_node_string(std::size_t const node_id,
                             nested_json::tree_meta_t const& tree_rep,
                             std::string const& json_input)
 {
-  auto const& node_categories  = std::get<0>(tree_rep);
-  auto const& parent_node_ids  = std::get<1>(tree_rep);
-  auto const& node_levels      = std::get<2>(tree_rep);
-  auto const& node_range_begin = std::get<3>(tree_rep);
-  auto const& node_range_end   = std::get<4>(tree_rep);
-
   auto node_to_str = [] __host__ __device__(nested_json::PdaTokenT const token) {
     switch (token) {
       case nested_json::NC_STRUCT: return "STRUCT";
@@ -49,25 +43,24 @@ std::string get_node_string(std::size_t const node_id,
     };
   };
 
-  return "<" + std::to_string(node_id) + ":" + node_to_str(node_categories[node_id]) + ":[" +
-         std::to_string(node_range_begin[node_id]) + ", " +
-         std::to_string(node_range_end[node_id]) + ") '" +
-         json_input.substr(node_range_begin[node_id],
-                           node_range_end[node_id] - node_range_begin[node_id]) +
+  return "<" + std::to_string(node_id) + ":" + node_to_str(tree_rep.node_categories[node_id]) +
+         ":[" + std::to_string(tree_rep.node_range_begin[node_id]) + ", " +
+         std::to_string(tree_rep.node_range_end[node_id]) + ") '" +
+         json_input.substr(tree_rep.node_range_begin[node_id],
+                           tree_rep.node_range_end[node_id] - tree_rep.node_range_begin[node_id]) +
          "'>";
 }
 
 void print_tree_representation(std::string const& json_input,
                                nested_json::tree_meta_t const& tree_rep)
 {
-  for (std::size_t i = 0; i < std::get<0>(tree_rep).size(); i++) {
-    auto const& parent_node_ids = std::get<1>(tree_rep);
-    std::size_t parent_id       = parent_node_ids[i];
+  for (std::size_t i = 0; i < tree_rep.node_categories.size(); i++) {
+    std::size_t parent_id = tree_rep.parent_node_ids[i];
     std::stack<std::size_t> path;
     path.push(i);
     while (parent_id != nested_json::parent_node_sentinel) {
       path.push(parent_id);
-      parent_id = parent_node_ids[parent_id];
+      parent_id = tree_rep.parent_node_ids[parent_id];
     }
 
     while (path.size()) {
@@ -271,18 +264,12 @@ TEST_F(JsonTest, TreeRepresentation)
     R"(}] )";
 
   // Get the JSON's tree representation
-  auto tree_rep = nested_json::get_tree_representation(
+  auto tree_rep = nested_json::detail::get_tree_representation(
     cudf::host_span<SymbolT const>{input.data(), input.size()}, stream_view);
 
   // Print tree representation
   if (std::getenv("CUDA_DBG_DUMP") != nullptr) { print_tree_representation(input, tree_rep); }
 
-  auto const& node_categories  = std::get<0>(tree_rep);
-  auto const& parent_node_ids  = std::get<1>(tree_rep);
-  auto const& node_levels      = std::get<2>(tree_rep);
-  auto const& node_range_begin = std::get<3>(tree_rep);
-  auto const& node_range_end   = std::get<4>(tree_rep);
-
   // Golden sample of node categories
   std::vector<nested_json::NodeT> golden_node_categories = {
     nested_json::NC_LIST, nested_json::NC_STRUCT, nested_json::NC_FN, nested_json::NC_STR,
@@ -347,17 +334,17 @@ TEST_F(JsonTest, TreeRepresentation)
     147, 155, 159, 160, 162, 168, 170, 172, 175, 176, 181, 195, 209, 217, 252, 260, 267};
 
   // Check results against golden samples
-  ASSERT_EQ(golden_node_categories.size(), node_categories.size());
-  ASSERT_EQ(golden_parent_node_ids.size(), parent_node_ids.size());
-  ASSERT_EQ(golden_node_levels.size(), node_levels.size());
-  ASSERT_EQ(golden_node_range_begin.size(), node_range_begin.size());
-  ASSERT_EQ(golden_node_range_end.size(), node_range_end.size());
+  ASSERT_EQ(golden_node_categories.size(), tree_rep.node_categories.size());
+  ASSERT_EQ(golden_parent_node_ids.size(), tree_rep.parent_node_ids.size());
+  ASSERT_EQ(golden_node_levels.size(), tree_rep.node_levels.size());
+  ASSERT_EQ(golden_node_range_begin.size(), tree_rep.node_range_begin.size());
+  ASSERT_EQ(golden_node_range_end.size(), tree_rep.node_range_end.size());
 
   for (std::size_t i = 0; i < golden_node_categories.size(); i++) {
-    ASSERT_EQ(golden_node_categories[i], node_categories[i]);
-    ASSERT_EQ(golden_parent_node_ids[i], parent_node_ids[i]);
-    ASSERT_EQ(golden_node_levels[i], node_levels[i]);
-    ASSERT_EQ(golden_node_range_begin[i], node_range_begin[i]);
-    ASSERT_EQ(golden_node_range_end[i], node_range_end[i]);
+    ASSERT_EQ(golden_node_categories[i], tree_rep.node_categories[i]);
+    ASSERT_EQ(golden_parent_node_ids[i], tree_rep.parent_node_ids[i]);
+    ASSERT_EQ(golden_node_levels[i], tree_rep.node_levels[i]);
+    ASSERT_EQ(golden_node_range_begin[i], tree_rep.node_range_begin[i]);
+    ASSERT_EQ(golden_node_range_end[i], tree_rep.node_range_end[i]);
   }
 }