Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds GPU implementation of JSON-token-stream to JSON-tree #11518

Merged
merged 36 commits into from
Sep 19, 2022
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
cdc98d6
pull changes from PR #11291
karthikeyann Aug 11, 2022
d074579
fix minor bug, cleanup unit test
karthikeyann Aug 11, 2022
f5287a5
add struct member begin, end tokens
karthikeyann Aug 11, 2022
d694f21
add get_tree_representation_gpu initial version
karthikeyann Aug 11, 2022
96eca61
Merge branch 'branch-22.10' of https://github.com/rapidsai/cudf into …
karthikeyann Aug 14, 2022
34ba420
delete repeated nested_json_test.cu
karthikeyann Aug 14, 2022
a083709
add print_tree debug print
karthikeyann Aug 23, 2022
1768fe7
fix valueEnd, structMemberEnd, add relevant unit test
karthikeyann Aug 24, 2022
4964826
add unit test for get_tree_representation with valueend space
karthikeyann Aug 24, 2022
4007f6a
cleanup, organize code
karthikeyann Aug 24, 2022
0c092ea
Merge branch 'branch-22.10' of https://github.com/rapidsai/cudf into …
karthikeyann Aug 26, 2022
8ff75a9
cleanup
karthikeyann Aug 26, 2022
f73a187
doc
karthikeyann Aug 26, 2022
018a779
Merge branch 'branch-22.10' of github.com:rapidsai/cudf into fea-json…
karthikeyann Sep 1, 2022
e531cba
merge fix, review comments
karthikeyann Sep 1, 2022
24d20e6
Merge branch 'branch-22.10' of github.com:rapidsai/cudf into fea-json…
karthikeyann Sep 1, 2022
94bdd1f
cleanup tests
karthikeyann Sep 5, 2022
edb78ad
add StructMemberEnd whitespace, newline test cases
karthikeyann Sep 5, 2022
ab1db5b
include_quote_chars change in tree generation
karthikeyann Sep 5, 2022
0ba90db
Merge branch 'branch-22.10' into fea-json-tree-gpu
karthikeyann Sep 5, 2022
15570d4
Merge branch 'branch-22.10' of github.com:rapidsai/cudf into fea-json…
karthikeyann Sep 7, 2022
b505e76
Merge branch 'branch-22.10' of github.com:rapidsai/cudf into fea-json…
karthikeyann Sep 8, 2022
ab4c7a1
temporary workaround until PR #11574 merge
karthikeyann Sep 8, 2022
a146936
add nvtx func ranges
karthikeyann Sep 8, 2022
6eb9bbe
address review comments
karthikeyann Sep 9, 2022
38b99cf
address review comments
karthikeyann Sep 12, 2022
285298a
use scatter instead of scatter_by_key, reuse token_levels memory
karthikeyann Sep 12, 2022
90cd2dc
review comments (davidwendt)
karthikeyann Sep 13, 2022
89188cd
Merge branch 'branch-22.10' of github.com:rapidsai/cudf into fea-json…
karthikeyann Sep 17, 2022
2bdf767
Merge branch 'branch-22.10' of github.com:rapidsai/cudf into fea-json…
karthikeyann Sep 19, 2022
7661669
is_node return type to bool (addressed review comment)
karthikeyann Sep 19, 2022
e69038b
add json_lines test input
karthikeyann Sep 19, 2022
b8367a7
fix missing get_token_index in range_end
karthikeyann Sep 19, 2022
7c4a490
Revert "fix missing get_token_index in range_end"
karthikeyann Sep 19, 2022
d3725a8
Revert "Revert "fix missing get_token_index in range_end""
karthikeyann Sep 19, 2022
2f34d3a
remove unnecessary __host__ __device__
karthikeyann Sep 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,7 @@ add_library(
src/io/csv/writer_impl.cu
src/io/functions.cpp
src/io/json/json_gpu.cu
src/io/json/json_tree.cu
src/io/json/nested_json_gpu.cu
src/io/json/reader_impl.cu
src/io/json/experimental/read_json.cpp
Expand Down
265 changes: 265 additions & 0 deletions cpp/src/io/json/json_tree.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "nested_json.hpp"

#include <io/utilities/hostdevice_vector.hpp>

#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/tabulate.h>

#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

namespace cudf::io::json {
namespace detail {

// DEBUG print: prints every element of a host container as a comma-separated
// row of integers, followed by the given label.
//
// @param cpu  Host container whose elements are convertible to int
// @param name Label printed after the values (order kept to match the existing
//             debug-output format)
template <typename T>
void print_vec(T const& cpu, std::string const& name)
{
  // printf for the values keeps the fixed-width "%3d," formatting terse;
  // the label goes through std::cout so std::string needs no c_str() dance.
  for (auto const& v : cpu)
    printf("%3d,", int(v));
  std::cout << name << std::endl;
}

// Maps a begin-of-* token onto the category of the tree node it produces.
// Any token that does not open a node maps to the error category.
struct token_to_node {
  __device__ auto operator()(PdaTokenT const token) -> NodeT
  {
    if (token == token_t::StructBegin) { return NC_STRUCT; }
    if (token == token_t::ListBegin) { return NC_LIST; }
    if (token == token_t::StringBegin) { return NC_STR; }
    if (token == token_t::ValueBegin) { return NC_VAL; }
    if (token == token_t::FieldNameBegin) { return NC_FN; }
    return NC_ERR;
  }
};

// Convert token indices to node range for each valid node.
template <typename T1, typename T2, typename T3>
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
struct node_ranges {
T1 tokens;
T2 token_indices;
T3 num_tokens;
__device__ auto operator()(size_type i) -> thrust::tuple<SymbolOffsetT, SymbolOffsetT>
{
// Whether a token expects to be followed by its respective end-of-* token partner
auto is_begin_of_section = [] __device__(PdaTokenT const token) {
upsj marked this conversation as resolved.
Show resolved Hide resolved
switch (token) {
case token_t::StringBegin:
case token_t::ValueBegin:
case token_t::FieldNameBegin: return true;
default: return false;
};
};
// The end-of-* partner token for a given beginning-of-* token
auto end_of_partner = [] __device__(PdaTokenT const token) {
switch (token) {
case token_t::StringBegin: return token_t::StringEnd;
case token_t::ValueBegin: return token_t::ValueEnd;
case token_t::FieldNameBegin: return token_t::FieldNameEnd;
default: return token_t::ErrorBegin;
};
};
auto get_token_index = [] __device__(PdaTokenT const token, SymbolOffsetT const token_index) {
constexpr SymbolOffsetT skip_quote_char = 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to change the logic for handling quotes, in order to distinguish between string values and literal or numeric values in type casting.

See:
https://github.com/rapidsai/cudf/pull/11574/files#diff-4a1e15bfc1ff56e474b0fb407f8fcd4e46357b8ca403022c40f7d044c2d3e949R1093-R1111

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the code to include quotes in the tree-generation code, and updated the tests as well.

switch (token) {
case token_t::StringBegin: return token_index + skip_quote_char;
case token_t::FieldNameBegin: return token_index + skip_quote_char;
default: return token_index;
};
};
PdaTokenT const token = tokens[i];
// The section from the original JSON input that this token demarcates
SymbolOffsetT range_begin = get_token_index(token, token_indices[i]);
SymbolOffsetT range_end = range_begin + 1;
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
if (is_begin_of_section(token)) {
if ((i + 1) < num_tokens && end_of_partner(token) == tokens[i + 1]) {
// Update the range_end for this pair of tokens
range_end = token_indices[i + 1];
}
}
return thrust::make_tuple(range_begin, range_end);
}
};

// Generates a tree representation of the given tokens, token_indices.
//
// @param tokens Vector of token types in the JSON string
// @param token_indices The indices within the input string corresponding to each token
// @param stream The CUDA stream to which kernels are dispatched
// @param mr Device memory resource used to allocate the returned tree's device memory
// @return Columnar tree representation: per-node category, parent node id, level,
//         and [begin, end) range into the original JSON input
tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
                                    device_span<SymbolOffsetT const> token_indices,
                                    rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
{
  // Whether a token does represent a node in the tree representation
  auto is_node = [] __device__(PdaTokenT const token) -> bool {
    switch (token) {
      case token_t::StructBegin:
      case token_t::ListBegin:
      case token_t::StringBegin:
      case token_t::ValueBegin:
      case token_t::FieldNameBegin:
      case token_t::ErrorBegin: return true;
      default: return false;
    };
  };

  // Whether the token pops from the parent node stack
  auto does_pop = [] __device__(PdaTokenT const token) -> bool {
    switch (token) {
      case token_t::StructMemberEnd:
      case token_t::StructEnd:
      case token_t::ListEnd: return true;
      default: return false;
    };
  };

  // Whether the token pushes onto the parent node stack
  auto does_push = [] __device__(PdaTokenT const token) -> bool {
    switch (token) {
      case token_t::FieldNameBegin:
      case token_t::StructBegin:
      case token_t::ListBegin: return true;
      default: return false;
    };
  };

  auto num_tokens = tokens.size();
  // 0/1 view of is_node, usable both as a summand (reduce/scan) and as a stencil
  auto is_node_it = thrust::make_transform_iterator(
    tokens.begin(),
    [is_node] __device__(PdaTokenT const token) -> size_type { return is_node(token); });
  auto num_nodes = thrust::reduce(rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens);

  // Node categories: copy_if with transform.
  rmm::device_uvector<NodeT> node_categories(num_nodes, stream, mr);
  auto node_categories_it =
    thrust::make_transform_output_iterator(node_categories.begin(), token_to_node{});
  auto node_categories_end = thrust::copy_if(rmm::exec_policy(stream),
                                             tokens.begin(),
                                             tokens.begin() + num_tokens,
                                             node_categories_it,
                                             is_node);
  CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes,
               "node category count mismatch");

  // Node levels: transform_exclusive_scan, copy_if.
  rmm::device_uvector<size_type> token_levels(num_tokens, stream);
  auto push_pop_it = thrust::make_transform_iterator(
    tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type {
      // +1 for tokens that open a scope, -1 for tokens that close one
      return does_push(token) ? 1 : (does_pop(token) ? -1 : 0);
    });
  // Exclusive scan: a token's level is the nesting depth BEFORE the token itself.
  thrust::exclusive_scan(
    rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin());

  rmm::device_uvector<TreeDepthT> node_levels(num_nodes, stream, mr);
  auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream),
                                         token_levels.begin(),
                                         token_levels.begin() + num_tokens,
                                         tokens.begin(),
                                         node_levels.begin(),
                                         is_node);
  CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch");

  // Node ranges: copy_if with transform.
  rmm::device_uvector<SymbolOffsetT> node_range_begin(num_nodes, stream, mr);
  rmm::device_uvector<SymbolOffsetT> node_range_end(num_nodes, stream, mr);
  auto node_range_tuple_it =
    thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin());
  using node_ranges_t =
    node_ranges<decltype(tokens.begin()), decltype(token_indices.begin()), decltype(num_tokens)>;
  auto node_range_out_it = thrust::make_transform_output_iterator(
    node_range_tuple_it, node_ranges_t{tokens.begin(), token_indices.begin(), num_tokens});

  auto node_range_out_end =
    thrust::copy_if(rmm::exec_policy(stream),
                    thrust::make_counting_iterator<size_type>(0),
                    thrust::make_counting_iterator<size_type>(0) + num_tokens,
                    node_range_out_it,
                    [is_node, tokens_gpu = tokens.begin()] __device__(size_type i) -> bool {
                      return is_node(tokens_gpu[i]);
                    });
  CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch");

  // Node parent ids: previous push token_id transform, stable sort, segmented scan with Max,
  // reorder, copy_if. This one is sort of logical stack. But more generalized.
  // TODO: make it own function.
  rmm::device_uvector<size_type> parent_token_ids(num_tokens, stream);
  rmm::device_uvector<size_type> initial_order(num_tokens, stream);
  thrust::sequence(rmm::exec_policy(stream), initial_order.begin(), initial_order.end());
  thrust::tabulate(rmm::exec_policy(stream),
                   parent_token_ids.begin(),
                   parent_token_ids.end(),
                   [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type {
                     // A token's candidate parent is the immediately preceding push
                     // token; -1 marks "no candidate yet" (filled in by the scan below)
                     return (i > 0 && does_push(tokens_gpu[i - 1])) ? i - 1 : -1;
                   });
  auto out_pid = thrust::make_zip_iterator(parent_token_ids.data(), initial_order.data());
  // Stable sort by level groups each level contiguously while preserving the
  // original token order inside a level (initial_order remembers the permutation).
  // TODO: use radix sort.
  thrust::stable_sort_by_key(rmm::exec_policy(stream),
                             token_levels.data(),
                             token_levels.data() + token_levels.size(),
                             out_pid);
  // SegmentedScan Max: within each level, propagate the nearest preceding parent id.
  thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
                                token_levels.data(),
                                token_levels.data() + token_levels.size(),
                                parent_token_ids.data(),
                                parent_token_ids.data(),
                                thrust::equal_to<size_type>{},
                                thrust::maximum<size_type>{});
  // Restore the original token order. FIXME: Avoid sorting again via scatter +
  // extra memory (thrust::scatter cannot run in place). Tradeoff?
  thrust::sort_by_key(rmm::exec_policy(stream),
                      initial_order.data(),
                      initial_order.data() + initial_order.size(),
                      parent_token_ids.data());

  // Exclusive scan of the 0/1 node flags yields each token's node id (if it is a node).
  rmm::device_uvector<size_type> node_ids_gpu(num_tokens, stream);
  thrust::exclusive_scan(
    rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin());

  rmm::device_uvector<NodeIndexT> parent_node_ids(num_nodes, stream, mr);
  auto parent_node_ids_it = thrust::make_transform_iterator(
    parent_token_ids.begin(),
    [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT {
      // Negative pid means "no parent"; otherwise map parent token id -> node id
      return pid < 0 ? pid : node_ids_gpu[pid];
    });
  auto parent_node_ids_end = thrust::copy_if(rmm::exec_policy(stream),
                                             parent_node_ids_it,
                                             parent_node_ids_it + parent_token_ids.size(),
                                             tokens.begin(),
                                             parent_node_ids.begin(),
                                             is_node);
  CUDF_EXPECTS(parent_node_ids_end - parent_node_ids.begin() == num_nodes,
               "parent node id gather mismatch");
  return {std::move(node_categories),
          std::move(parent_node_ids),
          std::move(node_levels),
          std::move(node_range_begin),
          std::move(node_range_end)};
}

} // namespace detail
} // namespace cudf::io::json
29 changes: 24 additions & 5 deletions cpp/src/io/json/nested_json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,11 @@ using TreeDepthT = StackLevelT;
* @brief Struct that encapsulate all information of a columnar tree representation.
*/
struct tree_meta_t {
  // Per-node category (struct / list / string / value / field name / error)
  rmm::device_uvector<NodeT> node_categories;
  // Per-node parent node id; roots carry the parent_node_sentinel value
  rmm::device_uvector<NodeIndexT> parent_node_ids;
  // Per-node depth in the tree
  rmm::device_uvector<TreeDepthT> node_levels;
  // Per-node begin offset into the original JSON input
  rmm::device_uvector<SymbolOffsetT> node_range_begin;
  // Per-node end offset (exclusive) into the original JSON input
  rmm::device_uvector<SymbolOffsetT> node_range_end;
};

constexpr NodeIndexT parent_node_sentinel = std::numeric_limits<NodeIndexT>::max();
Expand Down Expand Up @@ -243,6 +243,10 @@ enum token_t : PdaTokenT {
ListBegin,
/// End-of-list token (on encounter of semantic ']')
ListEnd,
// Beginning-of-struct-member token
StructMemberBegin,
// End-of-struct-member token
StructMemberEnd,
/// Beginning-of-field-name token (on encounter of first quote)
FieldNameBegin,
/// End-of-field-name token (on encounter of a field name's second quote)
Expand Down Expand Up @@ -298,6 +302,21 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Parses the given JSON string and generates a tree representation of the given input.
 *
 * @param tokens Vector of token types in the json string
 * @param token_indices The indices within the input string corresponding to each token
 * @param stream The CUDA stream to which kernels are dispatched
 * @param mr Device memory resource used to allocate the returned tree's device memory
 * @return A tree representation of the input JSON string as vectors of node type, parent index,
 * level, begin index, and end index in the input JSON string
 */
tree_meta_t get_tree_representation(
  device_span<PdaTokenT const> tokens,
  device_span<SymbolOffsetT const> token_indices,
  rmm::cuda_stream_view stream = cudf::default_stream_value,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Parses the given JSON string and generates table from the given input.
*
Expand Down
Loading