Reduce memory usage in nested JSON parser - tree generation #11864

Merged 22 commits on Oct 14, 2022. Changes shown are from 21 commits.

Commits
a75b0a5
fix the right condition for parent_node propagation initial condition
karthikeyann Sep 30, 2022
4abfb51
parent_node_id generation using only nodes instead of tokens
karthikeyann Oct 3, 2022
efb6621
reduce node_ids memory (not impacting peak memory)
karthikeyann Oct 6, 2022
5f250cb
reorder node_range, node_cat, scope limit token_levels
karthikeyann Oct 6, 2022
49cb0d7
use cub SortPairs to reduce memory
karthikeyann Oct 6, 2022
02a7b5b
reduce memory by cub::DoubleBuffer, scope limit token_id_for_nodes
karthikeyann Oct 6, 2022
9243d89
cleanup
karthikeyann Oct 7, 2022
7efc890
reorganize parent_node_ids algorithm (generic logical stack)
karthikeyann Oct 7, 2022
6d3a166
include CUDF_PUSH_RANGE, CUDF_POP_RANGE nvtx macros
karthikeyann Oct 7, 2022
bbcbffa
replace TreeDepthT with size_type due to cuda Invalid Device function…
karthikeyann Oct 7, 2022
483abf1
Merge branch 'branch-22.12' of github.com:rapidsai/cudf into enh-json…
karthikeyann Oct 7, 2022
f9f0926
update docs
karthikeyann Oct 7, 2022
f851232
remove nvtx range macros and debug prints
karthikeyann Oct 7, 2022
55369c9
remove nvtx macros
karthikeyann Oct 7, 2022
5eefd64
NVTX RANGES macros commit
karthikeyann Oct 7, 2022
3bb54f4
Revert "NVTX RANGES macros commit"
karthikeyann Oct 7, 2022
b70669d
Merge branch 'branch-22.12' of github.com:rapidsai/cudf into enh-json…
karthikeyann Oct 7, 2022
5a0a9a7
address review comments (upsj)
karthikeyann Oct 11, 2022
8578a22
Merge branch 'branch-22.12' of github.com:rapidsai/cudf into enh-json…
karthikeyann Oct 11, 2022
a356ea0
Apply suggestions from code review
karthikeyann Oct 14, 2022
7116570
address review comments
karthikeyann Oct 14, 2022
8e0c85f
add copy, memory savings comments for stable_sorted_key_order
karthikeyann Oct 14, 2022
233 changes: 148 additions & 85 deletions cpp/src/io/json/json_tree.cu
@@ -29,6 +29,8 @@

#include <cuco/static_map.cuh>

#include <cub/device/device_radix_sort.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
@@ -39,6 +41,7 @@
#include <thrust/fill.h>
#include <thrust/gather.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
@@ -125,6 +128,73 @@ struct node_ranges {
}
};

/**
* @brief Returns stable sorted keys and their sorted order
*
* Uses cub stable radix sort
*
* @tparam IndexType sorted order type
* @tparam KeyType key type
* @param keys keys to sort
* @param stream CUDA stream used for device memory operations and kernel launches.
* @return Sorted keys and indices producing that sorted order
*/
template <typename IndexType = size_t, typename KeyType>
std::pair<rmm::device_uvector<KeyType>, rmm::device_uvector<IndexType>> stable_sorted_key_order(
cudf::device_span<KeyType const> keys, rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();

// Determine temporary device storage requirements
rmm::device_uvector<KeyType> keys_buffer1(keys.size(), stream);
rmm::device_uvector<KeyType> keys_buffer2(keys.size(), stream);
rmm::device_uvector<IndexType> order_buffer1(keys.size(), stream);
rmm::device_uvector<IndexType> order_buffer2(keys.size(), stream);
cub::DoubleBuffer<IndexType> order_buffer(order_buffer1.data(), order_buffer2.data());
cub::DoubleBuffer<KeyType> keys_buffer(keys_buffer1.data(), keys_buffer2.data());
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairs(
nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size());
rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);

thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin());
thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end());

cub::DeviceRadixSort::SortPairs(
d_temp_storage.data(), temp_storage_bytes, keys_buffer, order_buffer, keys.size());

return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1)
: std::move(keys_buffer2),
order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1)
: std::move(order_buffer2)};
}

/**
* @brief Propagate parent node to siblings from first sibling.
*
* @param node_levels Node levels of each node
* @param parent_node_ids parent node ids, initialized for the first child of each push node;
* all other siblings are initialized to -1.
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
void propagate_parent_to_siblings(cudf::device_span<TreeDepthT const> node_levels,
cudf::device_span<NodeIndexT> parent_node_ids,
rmm::cuda_stream_view stream)
{
CUDF_FUNC_RANGE();
auto [sorted_node_levels, sorted_order] = stable_sorted_key_order<size_type>(node_levels, stream);
// Instead of gather, use a permutation_iterator, which is ~17% faster.

thrust::inclusive_scan_by_key(
rmm::exec_policy(stream),
sorted_node_levels.begin(),
sorted_node_levels.end(),
thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()),
thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()),
thrust::equal_to<TreeDepthT>{},
thrust::maximum<NodeIndexT>{});
}

// Generates a tree representation of the given tokens, token_indices.
tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
device_span<SymbolOffsetT const> token_indices,
@@ -166,12 +236,86 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
};

auto num_tokens = tokens.size();
auto is_node_it = thrust::make_transform_iterator(
tokens.begin(),
[is_node] __device__(auto t) -> size_type { return static_cast<size_type>(is_node(t)); });
auto num_nodes = thrust::count_if(
auto num_nodes = thrust::count_if(
rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node);

// Node levels: transform_exclusive_scan, copy_if.
rmm::device_uvector<TreeDepthT> node_levels(num_nodes, stream, mr);
{
rmm::device_uvector<TreeDepthT> token_levels(num_tokens, stream);
auto push_pop_it = thrust::make_transform_iterator(
tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type {
return does_push(token) - does_pop(token);
});
thrust::exclusive_scan(
rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin());

auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream),
token_levels.begin(),
token_levels.begin() + num_tokens,
tokens.begin(),
node_levels.begin(),
is_node);
CUDF_EXPECTS(thrust::distance(node_levels.begin(), node_levels_end) == num_nodes,
"node level count mismatch");
}

// Node parent ids:
// previous push node_id transform, stable sort by level, segmented scan with Max, reorder.
rmm::device_uvector<NodeIndexT> parent_node_ids(num_nodes, stream, mr);
// This block of code is a generalized logical stack algorithm. TODO: make this a separate function.
{
rmm::device_uvector<NodeIndexT> node_token_ids(num_nodes, stream);
thrust::copy_if(rmm::exec_policy(stream),
thrust::make_counting_iterator<NodeIndexT>(0),
thrust::make_counting_iterator<NodeIndexT>(0) + num_tokens,
tokens.begin(),
node_token_ids.begin(),
is_node);

// previous push node_id
// if previous node is a push, then i-1
// if previous node is FE, then i-2 (returns FB's index)
// if previous node is SMB and its previous node is a push, then i-2
// eg. `{ SMB FB FE VB VE SME` -> `{` index as FB's parent.
// else -1
auto first_childs_parent_token_id = [tokens_gpu =
tokens.begin()] __device__(auto i) -> NodeIndexT {
if (i <= 0) { return -1; }
if (tokens_gpu[i - 1] == token_t::StructBegin or tokens_gpu[i - 1] == token_t::ListBegin) {
return i - 1;
} else if (tokens_gpu[i - 1] == token_t::FieldNameEnd) {
return i - 2;
} else if (tokens_gpu[i - 1] == token_t::StructMemberBegin and
(tokens_gpu[i - 2] == token_t::StructBegin ||
tokens_gpu[i - 2] == token_t::ListBegin)) {
return i - 2;
} else {
return -1;
}
};

thrust::transform(
rmm::exec_policy(stream),
node_token_ids.begin(),
node_token_ids.end(),
parent_node_ids.begin(),
[node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__(
NodeIndexT const tid) -> NodeIndexT {
auto pid = first_childs_parent_token_id(tid);
return pid < 0
? parent_node_sentinel
: thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) -
Contributor:

This will probably be a good point to do some precomputations (hashmap or sparse bitmap) if this kernel becomes a performance issue.

Contributor Author:

This one takes ~20% of the parent_node_ids computation time. It takes significant time, but it is not the critical part.
A hash_map takes extra memory. I tried it now: memory increases from 7.97 GiB to 9.271 GiB, and it's much slower than lower_bound (12 ms for lower_bound vs 133 ms for hash_map).

I am interested in learning about the "sparse bitmap" approach.

Contributor:

It's an approach I often use to speed up binary-search-like lookups where the indices are unique. The fundamental idea is a bitvector with rank support: you store a bitvector containing a 1 if the corresponding token is a node and 0 otherwise, in groups of 32-bit words, and compute an exclusive prefix sum over the popcount of the words. Computing the lower bound is then prefix_sum[i / 32] + popcnt(prefix_mask(i % 32) & bitvector[i / 32]), where prefix_mask(i) = (1u << i) - 1 has all bits smaller than its parameter set. Overall, this uses 2 bits of storage per token.

If the number of tokens is much larger than the number of nodes, you can make the data structure even sparser by only storing the 32-bit words that are not all 0 (basically a reduce_by_key over the tokens) and using a normal bitvector to store a bit for each 32-bit word denoting whether it was non-zero. Then you have a two-level lookup (though the second level could also be another lookup data structure, like a hashmap). The data structure has pretty good caching properties, since locality in indices translates to locality in memory, which hashmaps purposefully don't have.

But since the number of nodes is not smaller than the number of tokens by a huge factor, I don't think this would be worth the effort here.
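A minimal sketch of this rank-support lookup in CUDA C++ (hypothetical names, not part of this PR; it assumes the bitvector is_node_bits and the word-level exclusive prefix sum word_prefix_sum were built beforehand):

// Hypothetical sketch: maps a token id to the count of node tokens before it,
// i.e. the same value the thrust::lower_bound call above computes.
// is_node_bits holds one bit per token (1 if that token is a node);
// word_prefix_sum[w] is the exclusive prefix sum of popcounts of the 32-bit words.
__device__ NodeIndexT token_to_node_index(uint32_t const* is_node_bits,
                                          NodeIndexT const* word_prefix_sum,
                                          NodeIndexT token_id)
{
  uint32_t const word        = is_node_bits[token_id / 32];
  uint32_t const prefix_mask = (1u << (token_id % 32)) - 1;  // bits strictly below token_id % 32
  return word_prefix_sum[token_id / 32] + __popc(word & prefix_mask);
}

This stores 2 bits per token (the bitvector plus one 32-bit prefix-sum entry per 32-bit word) and replaces the binary search with two memory reads and a popcount.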

node_ids_gpu;
// parent_node_sentinel is -1, useful for segmented max operation below
});
}
// Propagate parent node to siblings from first sibling - inplace.
propagate_parent_to_siblings(
cudf::device_span<TreeDepthT const>{node_levels.data(), node_levels.size()},
parent_node_ids,
stream);

// Node categories: copy_if with transform.
rmm::device_uvector<NodeT> node_categories(num_nodes, stream, mr);
auto node_categories_it =
@@ -184,24 +328,6 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes,
"node category count mismatch");

// Node levels: transform_exclusive_scan, copy_if.
rmm::device_uvector<TreeDepthT> token_levels(num_tokens, stream);
auto push_pop_it = thrust::make_transform_iterator(
tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type {
return does_push(token) - does_pop(token);
});
thrust::exclusive_scan(
rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin());

rmm::device_uvector<TreeDepthT> node_levels(num_nodes, stream, mr);
auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream),
token_levels.begin(),
token_levels.begin() + num_tokens,
tokens.begin(),
node_levels.begin(),
is_node);
CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch");

// Node ranges: copy_if with transform.
rmm::device_uvector<SymbolOffsetT> node_range_begin(num_nodes, stream, mr);
rmm::device_uvector<SymbolOffsetT> node_range_end(num_nodes, stream, mr);
@@ -223,69 +349,6 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
});
CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch");

// Node parent ids: previous push token_id transform, stable sort, segmented scan with Max,
// reorder, copy_if. This one is sort of logical stack. But more generalized.
// TODO: make it own function.
rmm::device_uvector<size_type> parent_token_ids(num_tokens, stream);
rmm::device_uvector<size_type> initial_order(num_tokens, stream);
// TODO re-write the algorithm to work only on nodes, not tokens.

thrust::sequence(rmm::exec_policy(stream), initial_order.begin(), initial_order.end());
thrust::tabulate(rmm::exec_policy(stream),
parent_token_ids.begin(),
parent_token_ids.end(),
[does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type {
return (i > 0) && does_push(tokens_gpu[i - 1]) ? i - 1 : -1;
// -1, not sentinel used here because of max operation below
});

auto out_pid = thrust::make_zip_iterator(parent_token_ids.data(), initial_order.data());
// Uses radix sort for builtin types.
thrust::stable_sort_by_key(rmm::exec_policy(stream),
token_levels.data(),
token_levels.data() + token_levels.size(),
out_pid);

// SegmentedScan Max.
thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
token_levels.data(),
token_levels.data() + token_levels.size(),
parent_token_ids.data(),
parent_token_ids.data(),
thrust::equal_to<size_type>{},
thrust::maximum<size_type>{});

// scatter to restore the original order.
{
rmm::device_uvector<size_type> temp_storage(num_tokens, stream);
thrust::scatter(rmm::exec_policy(stream),
parent_token_ids.begin(),
parent_token_ids.end(),
initial_order.begin(),
temp_storage.begin());
thrust::copy(
rmm::exec_policy(stream), temp_storage.begin(), temp_storage.end(), parent_token_ids.begin());
}

rmm::device_uvector<size_type> node_ids_gpu(num_tokens, stream);
thrust::exclusive_scan(
rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin());

rmm::device_uvector<NodeIndexT> parent_node_ids(num_nodes, stream, mr);
auto parent_node_ids_it = thrust::make_transform_iterator(
parent_token_ids.begin(),
[node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT {
return pid < 0 ? parent_node_sentinel : node_ids_gpu[pid];
});
auto parent_node_ids_end = thrust::copy_if(rmm::exec_policy(stream),
parent_node_ids_it,
parent_node_ids_it + parent_token_ids.size(),
tokens.begin(),
parent_node_ids.begin(),
is_node);
CUDF_EXPECTS(parent_node_ids_end - parent_node_ids.begin() == num_nodes,
"parent node id gather mismatch");

return {std::move(node_categories),
std::move(parent_node_ids),
std::move(node_levels),