From a75b0a5f13300f62fa7cf514651f105bd988d051 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 30 Sep 2022 13:08:16 +0530 Subject: [PATCH 01/19] fix the right condition for parent_node propagation initial condition --- cpp/src/io/json/json_tree.cu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index dbf026c351e..19fb8f2c980 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -235,7 +235,12 @@ tree_meta_t get_tree_representation(device_span tokens, parent_token_ids.begin(), parent_token_ids.end(), [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { - return (i > 0) && does_push(tokens_gpu[i - 1]) ? i - 1 : -1; + return (i > 0) + ? ((tokens_gpu[i - 1] == token_t::StructBegin || + tokens_gpu[i - 1] == token_t::ListBegin) + ? i - 1 + : (tokens_gpu[i - 1] == token_t::FieldNameEnd ? i - 2 : -1)) + : -1; // -1, not sentinel used here because of max operation below }); From 4abfb5147c3e065d2d06560e94bdad725f823d6d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 3 Oct 2022 22:11:59 +0530 Subject: [PATCH 02/19] parent_node_id generation using only nodes instead of tokens reduces memory usage by 35% (1GB json takes 10.951GB instead of 16.957GB) --- cpp/src/io/json/json_tree.cu | 167 ++++++++++++++++++++++++++--------- 1 file changed, 124 insertions(+), 43 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 19fb8f2c980..97320c70c30 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -53,6 +53,46 @@ namespace cudf::io::json { namespace detail { +// DEBUG print +[[maybe_unused]] auto to_token_str = [](PdaTokenT token) -> std::string { + switch (token) { + case token_t::StructBegin: return " {"; + case token_t::StructEnd: return " }"; + case token_t::ListBegin: return " ["; + case token_t::ListEnd: return " ]"; + case token_t::FieldNameBegin: return "FB"; + case token_t::FieldNameEnd: return "FE"; + case token_t::StringBegin: return "SB"; + case token_t::StringEnd: return "SE"; + case token_t::ErrorBegin: return "er"; + case token_t::ValueBegin: return "VB"; + case token_t::ValueEnd: return "VE"; + case token_t::StructMemberBegin: return " <"; + case token_t::StructMemberEnd: return " >"; + default: return "."; + } +}; +// DEBUG prints +auto to_cat = [](auto v) -> std::string { + switch (v) { + case NC_STRUCT: return " S"; + case NC_LIST: return " L"; + case NC_STR: return " \""; + case NC_VAL: return " V"; + case NC_FN: return " F"; + case NC_ERR: return "ER"; + default: return "UN"; + }; +}; +auto to_int = [](auto v) { return std::to_string(static_cast(v)); }; +auto print_vec = [](auto const& gpu, auto const name, auto converter) { + return; + auto cpu = cudf::detail::make_std_vector_sync(gpu, cudf::default_stream_value); + for (auto const& v : cpu) + printf("%3s,", converter(v).c_str()); + std::cout << name << std::endl; +}; + // The node that a token represents struct token_to_node { __device__ auto operator()(PdaTokenT const token) -> NodeT @@ -165,13 +205,16 @@ tree_meta_t get_tree_representation(device_span tokens, }; }; + CUDF_PUSH_RANGE("num_nodes"); auto num_tokens = tokens.size(); auto is_node_it = thrust::make_transform_iterator( tokens.begin(), [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); auto num_nodes = thrust::count_if( rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); + CUDF_POP_RANGE(); + 
CUDF_PUSH_RANGE("node_categories"); // Node categories: copy_if with transform. rmm::device_uvector node_categories(num_nodes, stream, mr); auto node_categories_it = @@ -183,7 +226,9 @@ tree_meta_t get_tree_representation(device_span tokens, is_node); CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, "node category count mismatch"); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("token_levels"); // Node levels: transform_exclusive_scan, copy_if. rmm::device_uvector token_levels(num_tokens, stream); auto push_pop_it = thrust::make_transform_iterator( @@ -192,7 +237,9 @@ tree_meta_t get_tree_representation(device_span tokens, }); thrust::exclusive_scan( rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("node_levels"); rmm::device_uvector node_levels(num_nodes, stream, mr); auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), token_levels.begin(), @@ -201,7 +248,9 @@ tree_meta_t get_tree_representation(device_span tokens, node_levels.begin(), is_node); CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("node_ranges"); // Node ranges: copy_if with transform. rmm::device_uvector node_range_begin(num_nodes, stream, mr); rmm::device_uvector node_range_end(num_nodes, stream, mr); @@ -222,77 +271,109 @@ tree_meta_t get_tree_representation(device_span tokens, return is_node(tokens_gpu[i]); }); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("parent_token_ids"); // Node parent ids: previous push token_id transform, stable sort, segmented scan with Max, // reorder, copy_if. This one is sort of logical stack. But more generalized. // TODO: make it own function. - rmm::device_uvector parent_token_ids(num_tokens, stream); - rmm::device_uvector initial_order(num_tokens, stream); - // TODO re-write the algorithm to work only on nodes, not tokens. - - thrust::sequence(rmm::exec_policy(stream), initial_order.begin(), initial_order.end()); - thrust::tabulate(rmm::exec_policy(stream), - parent_token_ids.begin(), - parent_token_ids.end(), - [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { - return (i > 0) - ? ((tokens_gpu[i - 1] == token_t::StructBegin || - tokens_gpu[i - 1] == token_t::ListBegin) - ? i - 1 - : (tokens_gpu[i - 1] == token_t::FieldNameEnd ? i - 2 : -1)) - : -1; - // -1, not sentinel used here because of max operation below - }); - auto out_pid = thrust::make_zip_iterator(parent_token_ids.data(), initial_order.data()); + // TODO re-write the algorithm to work only on nodes, not tokens. + // // ### only push nodes matter for scan! (verify throughly if true. For i-1 == FE, then i-3 + // matters. or i-2 because FB and SMB treated same. make sure nodeid matches right for FB/SMB. + // // now copy only push operations to seperate array with token_levels, & node_id. + // // sort by level, then scan it. then scatter to node_id positions. + // // then another scan for non-push nodes? or another scan before scatter? L/S SMB FE. + // // total memory: num_nodes*(4b+b+4b) <= 9b*num_nodes. + rmm::device_uvector parent_token_ids2(num_nodes, stream); + auto prev_parent_node_it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { + return (i > 0) ? 
((tokens_gpu[i - 1] == token_t::StructBegin || + tokens_gpu[i - 1] == token_t::ListBegin) + ? i - 1 + : ((tokens_gpu[i - 1] == token_t::FieldNameEnd || + (tokens_gpu[i - 1] == token_t::StructMemberBegin && + (tokens_gpu[i - 2] == token_t::StructBegin || + tokens_gpu[i - 2] == token_t::ListBegin))) + ? i - 2 + : -1)) + : -1; + // -1, not sentinel used here because of max operation below + }); + thrust::copy_if(rmm::exec_policy(stream), + prev_parent_node_it, + prev_parent_node_it + num_tokens, + tokens.begin(), + parent_token_ids2.begin(), + is_node); + rmm::device_uvector initial_order2(num_nodes, stream); + thrust::sequence(rmm::exec_policy(stream), initial_order2.begin(), initial_order2.end()); + auto out_pid2 = thrust::make_zip_iterator(parent_token_ids2.data(), initial_order2.data()); // Uses radix sort for builtin types. thrust::stable_sort_by_key(rmm::exec_policy(stream), - token_levels.data(), - token_levels.data() + token_levels.size(), - out_pid); + node_levels.data(), + node_levels.data() + node_levels.size(), + out_pid2); - // SegmentedScan Max. thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - token_levels.data(), - token_levels.data() + token_levels.size(), - parent_token_ids.data(), - parent_token_ids.data(), + node_levels.data(), + node_levels.data() + node_levels.size(), + parent_token_ids2.data(), + parent_token_ids2.data(), thrust::equal_to{}, thrust::maximum{}); + // thrust::sort_by_key(rmm::exec_policy(stream), + // initial_order2.data(), + // initial_order2.data() + initial_order2.size(), + // thrust::make_zip_iterator(parent_token_ids2.data(), + // node_levels.data())); // scatter to restore the original order. { - rmm::device_uvector temp_storage(num_tokens, stream); + CUDF_PUSH_RANGE("scatter"); + rmm::device_uvector temp_storage(num_nodes, stream); thrust::scatter(rmm::exec_policy(stream), - parent_token_ids.begin(), - parent_token_ids.end(), - initial_order.begin(), + parent_token_ids2.begin(), + parent_token_ids2.end(), + initial_order2.begin(), temp_storage.begin()); + thrust::copy(rmm::exec_policy(stream), + temp_storage.begin(), + temp_storage.end(), + parent_token_ids2.begin()); + rmm::device_uvector temp_storage2(num_nodes, stream); + thrust::scatter(rmm::exec_policy(stream), + node_levels.begin(), + node_levels.end(), + initial_order2.begin(), + temp_storage2.begin()); thrust::copy( - rmm::exec_policy(stream), temp_storage.begin(), temp_storage.end(), parent_token_ids.begin()); + rmm::exec_policy(stream), temp_storage2.begin(), temp_storage2.end(), node_levels.begin()); + CUDF_POP_RANGE(); } + CUDF_POP_RANGE(); // parent_token_ids + CUDF_PUSH_RANGE("node_ids"); + // use copy_if counting_it and do lower_bound. which is faster? rmm::device_uvector node_ids_gpu(num_tokens, stream); thrust::exclusive_scan( rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); + CUDF_POP_RANGE(); - rmm::device_uvector parent_node_ids(num_nodes, stream, mr); - auto parent_node_ids_it = thrust::make_transform_iterator( - parent_token_ids.begin(), + CUDF_PUSH_RANGE("parent_node_ids"); + thrust::transform( + rmm::exec_policy(stream), + parent_token_ids2.begin(), + parent_token_ids2.end(), + parent_token_ids2.begin(), [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { return pid < 0 ? 
parent_node_sentinel : node_ids_gpu[pid]; }); - auto parent_node_ids_end = thrust::copy_if(rmm::exec_policy(stream), - parent_node_ids_it, - parent_node_ids_it + parent_token_ids.size(), - tokens.begin(), - parent_node_ids.begin(), - is_node); - CUDF_EXPECTS(parent_node_ids_end - parent_node_ids.begin() == num_nodes, - "parent node id gather mismatch"); + CUDF_POP_RANGE(); return {std::move(node_categories), - std::move(parent_node_ids), + std::move(parent_token_ids2), std::move(node_levels), std::move(node_range_begin), std::move(node_range_end)}; From efb662193186b03f4fe553b8afb27b8a6ce4cd2a Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 6 Oct 2022 17:25:55 +0530 Subject: [PATCH 03/19] reduce node_ids memory (not impacting peak memory) --- cpp/src/io/json/json_tree.cu | 72 ++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 97320c70c30..6a5fc3177a9 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -86,7 +86,6 @@ auto to_cat = [](auto v) -> std::string { }; auto to_int = [](auto v) { return std::to_string(static_cast(v)); }; auto print_vec = [](auto const& gpu, auto const name, auto converter) { - return; auto cpu = cudf::detail::make_std_vector_sync(gpu, cudf::default_stream_value); for (auto const& v : cpu) printf("%3s,", converter(v).c_str()); @@ -207,9 +206,9 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_PUSH_RANGE("num_nodes"); auto num_tokens = tokens.size(); - auto is_node_it = thrust::make_transform_iterator( - tokens.begin(), - [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); + // auto is_node_it = thrust::make_transform_iterator( + // tokens.begin(), + // [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); auto num_nodes = thrust::count_if( rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); CUDF_POP_RANGE(); @@ -301,21 +300,30 @@ tree_meta_t get_tree_representation(device_span tokens, : -1; // -1, not sentinel used here because of max operation below }); + CUDF_PUSH_RANGE("copy_if"); thrust::copy_if(rmm::exec_policy(stream), prev_parent_node_it, prev_parent_node_it + num_tokens, tokens.begin(), parent_token_ids2.begin(), is_node); + CUDF_POP_RANGE(); + + CUDF_PUSH_RANGE("seq"); rmm::device_uvector initial_order2(num_nodes, stream); thrust::sequence(rmm::exec_policy(stream), initial_order2.begin(), initial_order2.end()); + CUDF_POP_RANGE(); + + CUDF_PUSH_RANGE("stable_sort"); auto out_pid2 = thrust::make_zip_iterator(parent_token_ids2.data(), initial_order2.data()); // Uses radix sort for builtin types. thrust::stable_sort_by_key(rmm::exec_policy(stream), node_levels.data(), node_levels.data() + node_levels.size(), out_pid2); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("scan"); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), node_levels.data(), node_levels.data() + node_levels.size(), @@ -323,6 +331,7 @@ tree_meta_t get_tree_representation(device_span tokens, parent_token_ids2.data(), thrust::equal_to{}, thrust::maximum{}); + CUDF_POP_RANGE(); // thrust::sort_by_key(rmm::exec_policy(stream), // initial_order2.data(), // initial_order2.data() + initial_order2.size(), @@ -356,20 +365,53 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_PUSH_RANGE("node_ids"); // use copy_if counting_it and do lower_bound. which is faster? 
- rmm::device_uvector node_ids_gpu(num_tokens, stream); - thrust::exclusive_scan( - rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); + // rmm::device_uvector node_ids_gpu(num_tokens, stream); + // thrust::exclusive_scan( + // rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); + + rmm::device_uvector node_ids_gpu3(num_nodes, + stream); // TODO reuse initial_order memory. + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, + tokens.begin(), + node_ids_gpu3.begin(), + is_node); + CUDF_POP_RANGE(); + // rmm::device_uvector node_ids_gpu2(num_tokens, stream); + // thrust::sequence(rmm::exec_policy(stream), node_ids_gpu2.begin(), node_ids_gpu2.end()); + // print_vec(node_ids_gpu2, "token_id_gpu", to_int); + // print_vec(tokens, "tokens", to_token_str); + // print_vec(node_ids_gpu, "node_ids_gpu", to_int); + // print_vec(parent_token_ids2, "parent_token_ids2", to_int); + // print_vec(node_categories, "node_categories", to_cat); + // print_vec(node_ids_gpu3, "node_ids_gpu3", to_int); CUDF_PUSH_RANGE("parent_node_ids"); - thrust::transform( - rmm::exec_policy(stream), - parent_token_ids2.begin(), - parent_token_ids2.end(), - parent_token_ids2.begin(), - [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { - return pid < 0 ? parent_node_sentinel : node_ids_gpu[pid]; - }); + // rmm::device_uvector parent_node_ids2(num_nodes, stream); + thrust::transform(rmm::exec_policy(stream), + parent_token_ids2.begin(), + parent_token_ids2.end(), + parent_token_ids2.begin(), + [node_ids_gpu = node_ids_gpu3.begin(), + num_nodes] __device__(size_type const pid) -> NodeIndexT { + return pid < 0 ? parent_node_sentinel + : thrust::lower_bound( + thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + }); + // print_vec(parent_node_ids2, "parent_node_ids2", to_int); + + // thrust::transform( + // rmm::exec_policy(stream), + // parent_token_ids2.begin(), + // parent_token_ids2.end(), + // parent_token_ids2.begin(), + // [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { + // return pid < 0 ? parent_node_sentinel : node_ids_gpu[pid]; + // }); + // print_vec(parent_token_ids2, "parent_token_ids2", to_int); CUDF_POP_RANGE(); return {std::move(node_categories), From 5f250cb1bd7c07ba3d668dfea6e256b4090f963b Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 6 Oct 2022 17:38:01 +0530 Subject: [PATCH 04/19] reorder node_range, node_cat, scope limit token_levels reduce peak memory usage (not total memory used) reorder node_range, node_cat, scope limit token_levels 10.957 GiB -> 9.91 GiB -> 9.774 GiB -> 9.403 GiB --- cpp/src/io/json/json_tree.cu | 110 ++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 6a5fc3177a9..8c980de8774 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -213,63 +213,28 @@ tree_meta_t get_tree_representation(device_span tokens, rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("node_categories"); - // Node categories: copy_if with transform. 
- rmm::device_uvector node_categories(num_nodes, stream, mr); - auto node_categories_it = - thrust::make_transform_output_iterator(node_categories.begin(), token_to_node{}); - auto node_categories_end = thrust::copy_if(rmm::exec_policy(stream), - tokens.begin(), - tokens.begin() + num_tokens, - node_categories_it, - is_node); - CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, - "node category count mismatch"); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("token_levels"); - // Node levels: transform_exclusive_scan, copy_if. - rmm::device_uvector token_levels(num_tokens, stream); - auto push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { - return does_push(token) - does_pop(token); - }); - thrust::exclusive_scan( - rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); - CUDF_POP_RANGE(); - - CUDF_PUSH_RANGE("node_levels"); rmm::device_uvector node_levels(num_nodes, stream, mr); - auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), - token_levels.begin(), - token_levels.begin() + num_tokens, - tokens.begin(), - node_levels.begin(), - is_node); - CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); - CUDF_POP_RANGE(); - - CUDF_PUSH_RANGE("node_ranges"); - // Node ranges: copy_if with transform. - rmm::device_uvector node_range_begin(num_nodes, stream, mr); - rmm::device_uvector node_range_end(num_nodes, stream, mr); - auto node_range_tuple_it = - thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); - // Whether the tokenizer stage should keep quote characters for string values - // If the tokenizer keeps the quote characters, they may be stripped during type casting - constexpr bool include_quote_char = true; - auto node_range_out_it = thrust::make_transform_output_iterator( - node_range_tuple_it, node_ranges{tokens, token_indices, include_quote_char}); + { + // Node levels: transform_exclusive_scan, copy_if. + rmm::device_uvector token_levels(num_tokens, stream); + auto push_pop_it = thrust::make_transform_iterator( + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { + return does_push(token) - does_pop(token); + }); + thrust::exclusive_scan( + rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); + CUDF_POP_RANGE(); - auto node_range_out_end = - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + num_tokens, - node_range_out_it, - [is_node, tokens_gpu = tokens.begin()] __device__(size_type i) -> bool { - return is_node(tokens_gpu[i]); - }); - CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); + CUDF_PUSH_RANGE("node_levels"); + auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), + token_levels.begin(), + token_levels.begin() + num_tokens, + tokens.begin(), + node_levels.begin(), + is_node); + CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); + } CUDF_POP_RANGE(); CUDF_PUSH_RANGE("parent_token_ids"); @@ -414,6 +379,43 @@ tree_meta_t get_tree_representation(device_span tokens, // print_vec(parent_token_ids2, "parent_token_ids2", to_int); CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("node_categories"); + // Node categories: copy_if with transform. 
+ rmm::device_uvector node_categories(num_nodes, stream, mr); + auto node_categories_it = + thrust::make_transform_output_iterator(node_categories.begin(), token_to_node{}); + auto node_categories_end = thrust::copy_if(rmm::exec_policy(stream), + tokens.begin(), + tokens.begin() + num_tokens, + node_categories_it, + is_node); + CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, + "node category count mismatch"); + CUDF_POP_RANGE(); + + CUDF_PUSH_RANGE("node_ranges"); + // Node ranges: copy_if with transform. + rmm::device_uvector node_range_begin(num_nodes, stream, mr); + rmm::device_uvector node_range_end(num_nodes, stream, mr); + auto node_range_tuple_it = + thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); + // Whether the tokenizer stage should keep quote characters for string values + // If the tokenizer keeps the quote characters, they may be stripped during type casting + constexpr bool include_quote_char = true; + auto node_range_out_it = thrust::make_transform_output_iterator( + node_range_tuple_it, node_ranges{tokens, token_indices, include_quote_char}); + + auto node_range_out_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, + node_range_out_it, + [is_node, tokens_gpu = tokens.begin()] __device__(size_type i) -> bool { + return is_node(tokens_gpu[i]); + }); + CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); + CUDF_POP_RANGE(); + return {std::move(node_categories), std::move(parent_token_ids2), std::move(node_levels), From 49cb0d7ee7478e233c5eb64dc53087242bb53242 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 00:51:15 +0530 Subject: [PATCH 05/19] use cub SortPairs to reduce memory 9.403 GiB to 8.487 GiB (for 1GB json input) --- cpp/src/io/json/json_tree.cu | 108 ++++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 47 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 8c980de8774..78914264020 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -29,6 +29,8 @@ #include +#include + #include #include #include @@ -274,58 +276,70 @@ tree_meta_t get_tree_representation(device_span tokens, is_node); CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("seq"); - rmm::device_uvector initial_order2(num_nodes, stream); - thrust::sequence(rmm::exec_policy(stream), initial_order2.begin(), initial_order2.end()); - CUDF_POP_RANGE(); - - CUDF_PUSH_RANGE("stable_sort"); - auto out_pid2 = thrust::make_zip_iterator(parent_token_ids2.data(), initial_order2.data()); - // Uses radix sort for builtin types. - thrust::stable_sort_by_key(rmm::exec_policy(stream), - node_levels.data(), - node_levels.data() + node_levels.size(), - out_pid2); - CUDF_POP_RANGE(); - - CUDF_PUSH_RANGE("scan"); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - node_levels.data(), - node_levels.data() + node_levels.size(), - parent_token_ids2.data(), - parent_token_ids2.data(), - thrust::equal_to{}, - thrust::maximum{}); - CUDF_POP_RANGE(); - // thrust::sort_by_key(rmm::exec_policy(stream), - // initial_order2.data(), - // initial_order2.data() + initial_order2.size(), - // thrust::make_zip_iterator(parent_token_ids2.data(), - // node_levels.data())); - - // scatter to restore the original order. { + auto [node_levels1, initial_order1] = [&]() { + // Uses stable radix sort for builtin types. 
+ CUDF_PUSH_RANGE("cub-sort"); + CUDF_PUSH_RANGE("seq"); + rmm::device_uvector initial_order2(num_nodes, stream); + thrust::sequence(rmm::exec_policy(stream), initial_order2.begin(), initial_order2.end()); + CUDF_POP_RANGE(); + + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + CUDF_PUSH_RANGE("cub-out"); + rmm::device_uvector node_levels1(num_nodes, stream, mr); + rmm::device_uvector initial_order1(num_nodes, stream); + cub::DeviceRadixSort::SortPairs(nullptr, + temp_storage_bytes, + node_levels.data(), + node_levels1.data(), + initial_order2.data(), + initial_order1.data(), + num_nodes); + CUDF_PUSH_RANGE("cub-temp"); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), + temp_storage_bytes, + node_levels.data(), + node_levels1.data(), + initial_order2.data(), + initial_order1.data(), + num_nodes); + CUDF_POP_RANGE(); + CUDF_POP_RANGE(); + CUDF_POP_RANGE(); + return std::pair{std::move(node_levels1), std::move(initial_order1)}; + }(); + // gather. additional memory!!!? + CUDF_PUSH_RANGE("scan"); + rmm::device_uvector parent_token_ids1(num_nodes, stream); + thrust::gather(rmm::exec_policy(stream), + initial_order1.begin(), + initial_order1.end(), + parent_token_ids2.begin(), + parent_token_ids1.begin()); + CUDF_POP_RANGE(); + + CUDF_PUSH_RANGE("scan"); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + node_levels1.data(), + node_levels1.data() + num_nodes, + parent_token_ids1.data(), + parent_token_ids1.data(), + thrust::equal_to{}, + thrust::maximum{}); + CUDF_POP_RANGE(); CUDF_PUSH_RANGE("scatter"); - rmm::device_uvector temp_storage(num_nodes, stream); + // scatter to restore the original order. thrust::scatter(rmm::exec_policy(stream), - parent_token_ids2.begin(), - parent_token_ids2.end(), - initial_order2.begin(), - temp_storage.begin()); - thrust::copy(rmm::exec_policy(stream), - temp_storage.begin(), - temp_storage.end(), - parent_token_ids2.begin()); - rmm::device_uvector temp_storage2(num_nodes, stream); - thrust::scatter(rmm::exec_policy(stream), - node_levels.begin(), - node_levels.end(), - initial_order2.begin(), - temp_storage2.begin()); - thrust::copy( - rmm::exec_policy(stream), temp_storage2.begin(), temp_storage2.end(), node_levels.begin()); + parent_token_ids1.begin(), + parent_token_ids1.end(), + initial_order1.begin(), + parent_token_ids2.begin()); CUDF_POP_RANGE(); } + CUDF_POP_RANGE(); // parent_token_ids CUDF_PUSH_RANGE("node_ids"); From 02a7b5bbc6d6dd4a5fb102d6bcb36972ec0e45b1 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 02:21:11 +0530 Subject: [PATCH 06/19] reduce memory by cub::DoubleBuffer, scope limit token_id_for_nodes --- cpp/src/io/json/json_tree.cu | 133 ++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 66 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 78914264020..ff07380be37 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -288,31 +288,28 @@ tree_meta_t get_tree_representation(device_span tokens, // Determine temporary device storage requirements size_t temp_storage_bytes = 0; CUDF_PUSH_RANGE("cub-out"); - rmm::device_uvector node_levels1(num_nodes, stream, mr); + rmm::device_uvector node_levels2(node_levels, stream); + rmm::device_uvector node_levels1(num_nodes, stream); rmm::device_uvector initial_order1(num_nodes, stream); - cub::DeviceRadixSort::SortPairs(nullptr, - temp_storage_bytes, - node_levels.data(), - 
node_levels1.data(), - initial_order2.data(), - initial_order1.data(), - num_nodes); + cub::DoubleBuffer order(initial_order2.data(), initial_order1.data()); + cub::DoubleBuffer levels(node_levels2.data(), node_levels1.data()); + cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, levels, order, num_nodes); CUDF_PUSH_RANGE("cub-temp"); + // std::cout<<"temp_storage_bytes: "< parent_token_ids1(num_nodes, stream); thrust::gather(rmm::exec_policy(stream), initial_order1.begin(), @@ -321,7 +318,7 @@ tree_meta_t get_tree_representation(device_span tokens, parent_token_ids1.begin()); CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("scan"); + CUDF_PUSH_RANGE("inscan"); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), node_levels1.data(), node_levels1.data() + num_nodes, @@ -338,60 +335,64 @@ tree_meta_t get_tree_representation(device_span tokens, initial_order1.begin(), parent_token_ids2.begin()); CUDF_POP_RANGE(); + CUDF_POP_RANGE(); } CUDF_POP_RANGE(); // parent_token_ids - CUDF_PUSH_RANGE("node_ids"); - // use copy_if counting_it and do lower_bound. which is faster? - // rmm::device_uvector node_ids_gpu(num_tokens, stream); - // thrust::exclusive_scan( - // rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); - - rmm::device_uvector node_ids_gpu3(num_nodes, - stream); // TODO reuse initial_order memory. - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + num_tokens, - tokens.begin(), - node_ids_gpu3.begin(), - is_node); + { + CUDF_PUSH_RANGE("node_ids"); + // use copy_if counting_it and do lower_bound. which is faster? + // rmm::device_uvector node_ids_gpu(num_tokens, stream); + // thrust::exclusive_scan( + // rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); + + rmm::device_uvector node_ids_gpu3(num_nodes, + stream); // TODO reuse initial_order memory. + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, + tokens.begin(), + node_ids_gpu3.begin(), + is_node); - CUDF_POP_RANGE(); - // rmm::device_uvector node_ids_gpu2(num_tokens, stream); - // thrust::sequence(rmm::exec_policy(stream), node_ids_gpu2.begin(), node_ids_gpu2.end()); - // print_vec(node_ids_gpu2, "token_id_gpu", to_int); - // print_vec(tokens, "tokens", to_token_str); - // print_vec(node_ids_gpu, "node_ids_gpu", to_int); - // print_vec(parent_token_ids2, "parent_token_ids2", to_int); - // print_vec(node_categories, "node_categories", to_cat); - // print_vec(node_ids_gpu3, "node_ids_gpu3", to_int); - - CUDF_PUSH_RANGE("parent_node_ids"); - // rmm::device_uvector parent_node_ids2(num_nodes, stream); - thrust::transform(rmm::exec_policy(stream), - parent_token_ids2.begin(), - parent_token_ids2.end(), - parent_token_ids2.begin(), - [node_ids_gpu = node_ids_gpu3.begin(), - num_nodes] __device__(size_type const pid) -> NodeIndexT { - return pid < 0 ? parent_node_sentinel - : thrust::lower_bound( - thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - - node_ids_gpu; - }); - // print_vec(parent_node_ids2, "parent_node_ids2", to_int); - - // thrust::transform( - // rmm::exec_policy(stream), - // parent_token_ids2.begin(), - // parent_token_ids2.end(), - // parent_token_ids2.begin(), - // [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { - // return pid < 0 ? 
parent_node_sentinel : node_ids_gpu[pid]; - // }); - // print_vec(parent_token_ids2, "parent_token_ids2", to_int); - CUDF_POP_RANGE(); + CUDF_POP_RANGE(); + // rmm::device_uvector node_ids_gpu2(num_tokens, stream); + // thrust::sequence(rmm::exec_policy(stream), node_ids_gpu2.begin(), node_ids_gpu2.end()); + // print_vec(node_ids_gpu2, "token_id_gpu", to_int); + // print_vec(tokens, "tokens", to_token_str); + // print_vec(node_ids_gpu, "node_ids_gpu", to_int); + // print_vec(parent_token_ids2, "parent_token_ids2", to_int); + // print_vec(node_categories, "node_categories", to_cat); + // print_vec(node_ids_gpu3, "node_ids_gpu3", to_int); + + CUDF_PUSH_RANGE("parent_node_ids"); + // rmm::device_uvector parent_node_ids2(num_nodes, stream); + thrust::transform(rmm::exec_policy(stream), + parent_token_ids2.begin(), + parent_token_ids2.end(), + parent_token_ids2.begin(), + [node_ids_gpu = node_ids_gpu3.begin(), + num_nodes] __device__(size_type const pid) -> NodeIndexT { + return pid < 0 + ? parent_node_sentinel + : thrust::lower_bound( + thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + }); + // print_vec(parent_node_ids2, "parent_node_ids2", to_int); + + // thrust::transform( + // rmm::exec_policy(stream), + // parent_token_ids2.begin(), + // parent_token_ids2.end(), + // parent_token_ids2.begin(), + // [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { + // return pid < 0 ? parent_node_sentinel : node_ids_gpu[pid]; + // }); + // print_vec(parent_token_ids2, "parent_token_ids2", to_int); + CUDF_POP_RANGE(); + } CUDF_PUSH_RANGE("node_categories"); // Node categories: copy_if with transform. From 9243d893b52fdda12443ef24c63f487e5d4aad96 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 14:06:08 +0530 Subject: [PATCH 07/19] cleanup --- cpp/src/io/json/json_tree.cu | 49 ++++++++---------------------------- 1 file changed, 10 insertions(+), 39 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index ff07380be37..43f7cf6e81b 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -208,10 +208,7 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_PUSH_RANGE("num_nodes"); auto num_tokens = tokens.size(); - // auto is_node_it = thrust::make_transform_iterator( - // tokens.begin(), - // [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); - auto num_nodes = thrust::count_if( + auto num_nodes = thrust::count_if( rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); CUDF_POP_RANGE(); @@ -240,17 +237,16 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_POP_RANGE(); CUDF_PUSH_RANGE("parent_token_ids"); - // Node parent ids: previous push token_id transform, stable sort, segmented scan with Max, - // reorder, copy_if. This one is sort of logical stack. But more generalized. + // Node parent ids: + // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. + // This one is sort of logical stack. But more generalized. // TODO: make it own function. - // TODO re-write the algorithm to work only on nodes, not tokens. - // // ### only push nodes matter for scan! (verify throughly if true. For i-1 == FE, then i-3 - // matters. or i-2 because FB and SMB treated same. make sure nodeid matches right for FB/SMB. - // // now copy only push operations to seperate array with token_levels, & node_id. - // // sort by level, then scan it. 
then scatter to node_id positions. - // // then another scan for non-push nodes? or another scan before scatter? L/S SMB FE. - // // total memory: num_nodes*(4b+b+4b) <= 9b*num_nodes. + // previous push node_id + // if previous node is a push, then i-1 + // if previous node is FE, then i-2 + // if previous node is SMB and its previous node is a push, then i-2 + // else -1 rmm::device_uvector parent_token_ids2(num_nodes, stream); auto prev_parent_node_it = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -342,13 +338,7 @@ tree_meta_t get_tree_representation(device_span tokens, { CUDF_PUSH_RANGE("node_ids"); - // use copy_if counting_it and do lower_bound. which is faster? - // rmm::device_uvector node_ids_gpu(num_tokens, stream); - // thrust::exclusive_scan( - // rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); - - rmm::device_uvector node_ids_gpu3(num_nodes, - stream); // TODO reuse initial_order memory. + rmm::device_uvector node_ids_gpu3(num_nodes, stream); thrust::copy_if(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_tokens, @@ -357,17 +347,8 @@ tree_meta_t get_tree_representation(device_span tokens, is_node); CUDF_POP_RANGE(); - // rmm::device_uvector node_ids_gpu2(num_tokens, stream); - // thrust::sequence(rmm::exec_policy(stream), node_ids_gpu2.begin(), node_ids_gpu2.end()); - // print_vec(node_ids_gpu2, "token_id_gpu", to_int); - // print_vec(tokens, "tokens", to_token_str); - // print_vec(node_ids_gpu, "node_ids_gpu", to_int); - // print_vec(parent_token_ids2, "parent_token_ids2", to_int); - // print_vec(node_categories, "node_categories", to_cat); - // print_vec(node_ids_gpu3, "node_ids_gpu3", to_int); CUDF_PUSH_RANGE("parent_node_ids"); - // rmm::device_uvector parent_node_ids2(num_nodes, stream); thrust::transform(rmm::exec_policy(stream), parent_token_ids2.begin(), parent_token_ids2.end(), @@ -381,16 +362,6 @@ tree_meta_t get_tree_representation(device_span tokens, node_ids_gpu; }); // print_vec(parent_node_ids2, "parent_node_ids2", to_int); - - // thrust::transform( - // rmm::exec_policy(stream), - // parent_token_ids2.begin(), - // parent_token_ids2.end(), - // parent_token_ids2.begin(), - // [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { - // return pid < 0 ? parent_node_sentinel : node_ids_gpu[pid]; - // }); - // print_vec(parent_token_ids2, "parent_token_ids2", to_int); CUDF_POP_RANGE(); } From 7efc890555061cad398154f54b2e563eafcd9e43 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 16:24:45 +0530 Subject: [PATCH 08/19] reorganize parent_node_ids algorithm (generic logical stack) --- cpp/src/io/json/json_tree.cu | 235 ++++++++++++++++++----------------- 1 file changed, 120 insertions(+), 115 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 43f7cf6e81b..428a295706a 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -166,6 +167,74 @@ struct node_ranges { } }; +/** + * @brief Returns stable sorted key and its sorted order + * + * @tparam IndexType sorted order type + * @tparam KeyType key type + * @param key key to sort + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @return A pair of sorted key and its sorted order + */ +template +std::pair, rmm::device_uvector> stable_sorted_key_order( + cudf::device_span key, rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + // Uses stable radix sort for builtin types. + + // Determine temporary device storage requirements + rmm::device_uvector key1(key.size(), stream); + rmm::device_uvector key2(key.size(), stream); + rmm::device_uvector order1(key.size(), stream); + rmm::device_uvector order2(key.size(), stream); + cub::DoubleBuffer order(order1.data(), order2.data()); + cub::DoubleBuffer key_buffer(key1.data(), key2.data()); + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, key_buffer, order, key.size()); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + CUDF_PUSH_RANGE("copy"); + thrust::copy(rmm::exec_policy(stream), key.begin(), key.end(), key1.begin()); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("seq"); + thrust::sequence(rmm::exec_policy(stream), order1.begin(), order1.end()); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("cub-sort"); + cub::DeviceRadixSort::SortPairs( + d_temp_storage.data(), temp_storage_bytes, key_buffer, order, key.size()); + CUDF_POP_RANGE(); + return std::pair{key_buffer.Current() == key1.data() ? std::move(key1) : std::move(key2), + order.Current() == order1.data() ? std::move(order1) : std::move(order2)}; +} + +/** + * @brief Propagate parent node to siblings from first sibling. + * + * @param node_levels Node levels of each node + * @param parent_node_ids parent node ids initialized for first child of each push node, + * and other siblings are initialized to -1. + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +void propagate_parent_to_siblings(cudf::device_span node_levels, + cudf::device_span parent_node_ids, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); + // instead of gather, using permutation_iterator, which is ~17% faster + + CUDF_PUSH_RANGE("inscan"); + thrust::inclusive_scan_by_key( + rmm::exec_policy(stream), + sorted_node_levels.begin(), + sorted_node_levels.end(), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::equal_to{}, + thrust::maximum{}); + CUDF_POP_RANGE(); +} + // Generates a tree representation of the given tokens, token_indices. tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, @@ -218,7 +287,7 @@ tree_meta_t get_tree_representation(device_span tokens, // Node levels: transform_exclusive_scan, copy_if. rmm::device_uvector token_levels(num_tokens, stream); auto push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> TreeDepthT { return does_push(token) - does_pop(token); }); thrust::exclusive_scan( @@ -242,128 +311,64 @@ tree_meta_t get_tree_representation(device_span tokens, // This one is sort of logical stack. But more generalized. // TODO: make it own function. 
- // previous push node_id - // if previous node is a push, then i-1 - // if previous node is FE, then i-2 - // if previous node is SMB and its previous node is a push, then i-2 - // else -1 - rmm::device_uvector parent_token_ids2(num_nodes, stream); - auto prev_parent_node_it = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { - return (i > 0) ? ((tokens_gpu[i - 1] == token_t::StructBegin || - tokens_gpu[i - 1] == token_t::ListBegin) - ? i - 1 - : ((tokens_gpu[i - 1] == token_t::FieldNameEnd || - (tokens_gpu[i - 1] == token_t::StructMemberBegin && - (tokens_gpu[i - 2] == token_t::StructBegin || - tokens_gpu[i - 2] == token_t::ListBegin))) - ? i - 2 - : -1)) - : -1; - // -1, not sentinel used here because of max operation below - }); - CUDF_PUSH_RANGE("copy_if"); - thrust::copy_if(rmm::exec_policy(stream), - prev_parent_node_it, - prev_parent_node_it + num_tokens, - tokens.begin(), - parent_token_ids2.begin(), - is_node); - CUDF_POP_RANGE(); - - { - auto [node_levels1, initial_order1] = [&]() { - // Uses stable radix sort for builtin types. - CUDF_PUSH_RANGE("cub-sort"); - CUDF_PUSH_RANGE("seq"); - rmm::device_uvector initial_order2(num_nodes, stream); - thrust::sequence(rmm::exec_policy(stream), initial_order2.begin(), initial_order2.end()); - CUDF_POP_RANGE(); - - // Determine temporary device storage requirements - size_t temp_storage_bytes = 0; - CUDF_PUSH_RANGE("cub-out"); - rmm::device_uvector node_levels2(node_levels, stream); - rmm::device_uvector node_levels1(num_nodes, stream); - rmm::device_uvector initial_order1(num_nodes, stream); - cub::DoubleBuffer order(initial_order2.data(), initial_order1.data()); - cub::DoubleBuffer levels(node_levels2.data(), node_levels1.data()); - cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, levels, order, num_nodes); - CUDF_PUSH_RANGE("cub-temp"); - // std::cout<<"temp_storage_bytes: "< parent_token_ids1(num_nodes, stream); - thrust::gather(rmm::exec_policy(stream), - initial_order1.begin(), - initial_order1.end(), - parent_token_ids2.begin(), - parent_token_ids1.begin()); - CUDF_POP_RANGE(); - - CUDF_PUSH_RANGE("inscan"); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - node_levels1.data(), - node_levels1.data() + num_nodes, - parent_token_ids1.data(), - parent_token_ids1.data(), - thrust::equal_to{}, - thrust::maximum{}); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("scatter"); - // scatter to restore the original order. 
- thrust::scatter(rmm::exec_policy(stream), - parent_token_ids1.begin(), - parent_token_ids1.end(), - initial_order1.begin(), - parent_token_ids2.begin()); - CUDF_POP_RANGE(); - CUDF_POP_RANGE(); - } - - CUDF_POP_RANGE(); // parent_token_ids - + rmm::device_uvector parent_node_ids(num_nodes, stream); { - CUDF_PUSH_RANGE("node_ids"); - rmm::device_uvector node_ids_gpu3(num_nodes, stream); + rmm::device_uvector node_token_ids(num_nodes, stream); + CUDF_PUSH_RANGE("copy_if"); thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + num_tokens, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, tokens.begin(), - node_ids_gpu3.begin(), + node_token_ids.begin(), is_node); - CUDF_POP_RANGE(); + // previous push node_id + // if previous node is a push, then i-1 + // if previous node is FE, then i-2 + // if previous node is SMB and its previous node is a push, then i-2 + // else -1 + auto first_childs_parent_token_id = [tokens_gpu = + tokens.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) return -1; + if (tokens_gpu[i - 1] == token_t::StructBegin || tokens_gpu[i - 1] == token_t::ListBegin) + return i - 1; + else if (tokens_gpu[i - 1] == token_t::FieldNameEnd) + return i - 2; + else if (tokens_gpu[i - 1] == token_t::StructMemberBegin && + (tokens_gpu[i - 2] == token_t::StructBegin || + tokens_gpu[i - 2] == token_t::ListBegin)) + return i - 2; + else + return -1; + // -1 is not sentinel, is required because of max operation below + }; + CUDF_PUSH_RANGE("parent_node_ids"); - thrust::transform(rmm::exec_policy(stream), - parent_token_ids2.begin(), - parent_token_ids2.end(), - parent_token_ids2.begin(), - [node_ids_gpu = node_ids_gpu3.begin(), - num_nodes] __device__(size_type const pid) -> NodeIndexT { - return pid < 0 - ? parent_node_sentinel - : thrust::lower_bound( - thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - - node_ids_gpu; - }); - // print_vec(parent_node_ids2, "parent_node_ids2", to_int); + thrust::transform( + rmm::exec_policy(stream), + node_token_ids.begin(), + node_token_ids.end(), + parent_node_ids.begin(), + [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__( + NodeIndexT const tid) -> NodeIndexT { + auto pid = first_childs_parent_token_id(tid); + return pid < 0 + ? parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + // TODO node id will be 1 or 2 nodes back. It could be faster than binary search. + }); + // print_vec(parent_node_ids, "parent_node_ids", to_int); CUDF_POP_RANGE(); } + // Propagate parent node to siblings from first sibling - inplace. + propagate_parent_to_siblings( + cudf::device_span{node_levels.data(), node_levels.size()}, + parent_node_ids, + stream); + + CUDF_POP_RANGE(); // parent_token_ids CUDF_PUSH_RANGE("node_categories"); // Node categories: copy_if with transform. 
@@ -403,7 +408,7 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_POP_RANGE(); return {std::move(node_categories), - std::move(parent_token_ids2), + std::move(parent_node_ids), std::move(node_levels), std::move(node_range_begin), std::move(node_range_end)}; From 6d3a1663fbbc1f33de6adf2e1f2801c3d8568f9d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 16:55:59 +0530 Subject: [PATCH 09/19] include CUDF_PUSH_RANGE, CUDF_POP_RANGE nvtx macros --- cpp/src/io/json/json_tree.cu | 1 - cpp/src/io/json/nested_json.hpp | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 428a295706a..e9e6ae64bd0 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -357,7 +357,6 @@ tree_meta_t get_tree_representation(device_span tokens, ? parent_node_sentinel : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - node_ids_gpu; - // TODO node id will be 1 or 2 nodes back. It could be faster than binary search. }); // print_vec(parent_node_ids, "parent_node_ids", to_int); CUDF_POP_RANGE(); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 10d209b2ea6..90b679265fb 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -367,3 +367,17 @@ table_with_metadata host_parse_nested_json( } // namespace detail } // namespace cudf::io::json + +#define _CONCAT_(x, y) x##y +#define CONCAT(x, y) _CONCAT_(x, y) + +#define NVTX3_PUSH_RANGE_IN(D, tag) \ + ::nvtx3::registered_message const CONCAT(nvtx3_range_name__, __LINE__){std::string(tag)}; \ + ::nvtx3::event_attributes const CONCAT(nvtx3_range_attr__, \ + __LINE__){CONCAT(nvtx3_range_name__, __LINE__)}; \ + nvtxDomainRangePushEx(::nvtx3::domain::get(), CONCAT(nvtx3_range_attr__, __LINE__).get()); + +#define NVTX3_POP_RANGE(D) nvtxDomainRangePop(::nvtx3::domain::get()); + +#define CUDF_PUSH_RANGE(tag) NVTX3_PUSH_RANGE_IN(cudf::libcudf_domain, tag) +#define CUDF_POP_RANGE() NVTX3_POP_RANGE(cudf::libcudf_domain) From bbcbffa7ce5bd35395782db8c98e80cd8b46f5f0 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 17:04:21 +0530 Subject: [PATCH 10/19] replace TreeDepthT with size_type due to cuda Invalid Device function error --- cpp/src/io/json/json_tree.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index e9e6ae64bd0..0c06320a6dc 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -287,7 +287,7 @@ tree_meta_t get_tree_representation(device_span tokens, // Node levels: transform_exclusive_scan, copy_if. 
rmm::device_uvector token_levels(num_tokens, stream); auto push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> TreeDepthT { + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { return does_push(token) - does_pop(token); }); thrust::exclusive_scan( From f9f0926c801d5708630d26558aa97428c5d2744c Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 17:19:01 +0530 Subject: [PATCH 11/19] update docs --- cpp/src/io/json/json_tree.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 0c06320a6dc..5a9b3ca18b3 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -170,6 +170,8 @@ struct node_ranges { /** * @brief Returns stable sorted key and its sorted order * + * Uses cub stable radix sort + * * @tparam IndexType sorted order type * @tparam KeyType key type * @param key key to sort @@ -181,7 +183,6 @@ std::pair, rmm::device_uvector> stable_s cudf::device_span key, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - // Uses stable radix sort for builtin types. // Determine temporary device storage requirements rmm::device_uvector key1(key.size(), stream); @@ -308,7 +309,7 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_PUSH_RANGE("parent_token_ids"); // Node parent ids: // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. - // This one is sort of logical stack. But more generalized. + // This algorithms si is more generalized logical stack. // TODO: make it own function. rmm::device_uvector parent_node_ids(num_nodes, stream); @@ -341,7 +342,6 @@ tree_meta_t get_tree_representation(device_span tokens, return i - 2; else return -1; - // -1 is not sentinel, is required because of max operation below }; CUDF_PUSH_RANGE("parent_node_ids"); @@ -357,6 +357,7 @@ tree_meta_t get_tree_representation(device_span tokens, ? 
parent_node_sentinel : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - node_ids_gpu; + // parent_node_sentinel is -1, useful for segmented max operation below }); // print_vec(parent_node_ids, "parent_node_ids", to_int); CUDF_POP_RANGE(); From f851232bc3e7eecd81e71020da8f5c3e27b3c441 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 17:27:54 +0530 Subject: [PATCH 12/19] remove nvtx range macros and debug prints --- cpp/src/io/json/json_tree.cu | 75 +++--------------------------------- 1 file changed, 6 insertions(+), 69 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 5a9b3ca18b3..6a94ea05e81 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -56,45 +56,6 @@ namespace cudf::io::json { namespace detail { -// DEBUG print -[[maybe_unused]] auto to_token_str = [](PdaTokenT token) -> std::string { - switch (token) { - case token_t::StructBegin: return " {"; - case token_t::StructEnd: return " }"; - case token_t::ListBegin: return " ["; - case token_t::ListEnd: return " ]"; - case token_t::FieldNameBegin: return "FB"; - case token_t::FieldNameEnd: return "FE"; - case token_t::StringBegin: return "SB"; - case token_t::StringEnd: return "SE"; - case token_t::ErrorBegin: return "er"; - case token_t::ValueBegin: return "VB"; - case token_t::ValueEnd: return "VE"; - case token_t::StructMemberBegin: return " <"; - case token_t::StructMemberEnd: return " >"; - default: return "."; - } -}; -// DEBUG prints -auto to_cat = [](auto v) -> std::string { - switch (v) { - case NC_STRUCT: return " S"; - case NC_LIST: return " L"; - case NC_STR: return " \""; - case NC_VAL: return " V"; - case NC_FN: return " F"; - case NC_ERR: return "ER"; - default: return "UN"; - }; -}; -auto to_int = [](auto v) { return std::to_string(static_cast(v)); }; -auto print_vec = [](auto const& gpu, auto const name, auto converter) { - auto cpu = cudf::detail::make_std_vector_sync(gpu, cudf::default_stream_value); - for (auto const& v : cpu) - printf("%3s,", converter(v).c_str()); - std::cout << name << std::endl; -}; - // The node that a token represents struct token_to_node { __device__ auto operator()(PdaTokenT const token) -> NodeT @@ -194,16 +155,13 @@ std::pair, rmm::device_uvector> stable_s size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, key_buffer, order, key.size()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - CUDF_PUSH_RANGE("copy"); + thrust::copy(rmm::exec_policy(stream), key.begin(), key.end(), key1.begin()); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("seq"); thrust::sequence(rmm::exec_policy(stream), order1.begin(), order1.end()); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("cub-sort"); + cub::DeviceRadixSort::SortPairs( d_temp_storage.data(), temp_storage_bytes, key_buffer, order, key.size()); - CUDF_POP_RANGE(); + return std::pair{key_buffer.Current() == key1.data() ? std::move(key1) : std::move(key2), order.Current() == order1.data() ? 
std::move(order1) : std::move(order2)}; } @@ -224,7 +182,6 @@ void propagate_parent_to_siblings(cudf::device_span node_level auto [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); // instead of gather, using permutation_iterator, which is ~17% faster - CUDF_PUSH_RANGE("inscan"); thrust::inclusive_scan_by_key( rmm::exec_policy(stream), sorted_node_levels.begin(), @@ -233,7 +190,6 @@ void propagate_parent_to_siblings(cudf::device_span node_level thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), thrust::equal_to{}, thrust::maximum{}); - CUDF_POP_RANGE(); } // Generates a tree representation of the given tokens, token_indices. @@ -276,16 +232,13 @@ tree_meta_t get_tree_representation(device_span tokens, }; }; - CUDF_PUSH_RANGE("num_nodes"); auto num_tokens = tokens.size(); auto num_nodes = thrust::count_if( rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("token_levels"); + // Node levels: transform_exclusive_scan, copy_if. rmm::device_uvector node_levels(num_nodes, stream, mr); { - // Node levels: transform_exclusive_scan, copy_if. rmm::device_uvector token_levels(num_tokens, stream); auto push_pop_it = thrust::make_transform_iterator( tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { @@ -293,9 +246,7 @@ tree_meta_t get_tree_representation(device_span tokens, }); thrust::exclusive_scan( rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("node_levels"); auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), token_levels.begin(), token_levels.begin() + num_tokens, @@ -304,25 +255,20 @@ tree_meta_t get_tree_representation(device_span tokens, is_node); CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); } - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("parent_token_ids"); // Node parent ids: // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. - // This algorithms si is more generalized logical stack. + // This algorithms is a more generalized logical stack. // TODO: make it own function. - - rmm::device_uvector parent_node_ids(num_nodes, stream); + rmm::device_uvector parent_node_ids(num_nodes, stream, mr); { rmm::device_uvector node_token_ids(num_nodes, stream); - CUDF_PUSH_RANGE("copy_if"); thrust::copy_if(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_tokens, tokens.begin(), node_token_ids.begin(), is_node); - CUDF_POP_RANGE(); // previous push node_id // if previous node is a push, then i-1 @@ -344,7 +290,6 @@ tree_meta_t get_tree_representation(device_span tokens, return -1; }; - CUDF_PUSH_RANGE("parent_node_ids"); thrust::transform( rmm::exec_policy(stream), node_token_ids.begin(), @@ -359,8 +304,6 @@ tree_meta_t get_tree_representation(device_span tokens, node_ids_gpu; // parent_node_sentinel is -1, useful for segmented max operation below }); - // print_vec(parent_node_ids, "parent_node_ids", to_int); - CUDF_POP_RANGE(); } // Propagate parent node to siblings from first sibling - inplace. propagate_parent_to_siblings( @@ -368,9 +311,6 @@ tree_meta_t get_tree_representation(device_span tokens, parent_node_ids, stream); - CUDF_POP_RANGE(); // parent_token_ids - - CUDF_PUSH_RANGE("node_categories"); // Node categories: copy_if with transform. 
rmm::device_uvector node_categories(num_nodes, stream, mr); auto node_categories_it = @@ -382,9 +322,7 @@ tree_meta_t get_tree_representation(device_span tokens, is_node); CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, "node category count mismatch"); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("node_ranges"); // Node ranges: copy_if with transform. rmm::device_uvector node_range_begin(num_nodes, stream, mr); rmm::device_uvector node_range_end(num_nodes, stream, mr); @@ -405,7 +343,6 @@ tree_meta_t get_tree_representation(device_span tokens, return is_node(tokens_gpu[i]); }); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); - CUDF_POP_RANGE(); return {std::move(node_categories), std::move(parent_node_ids), From 55369c9c5af7579617f9853aeec642c082789d63 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 17:46:56 +0530 Subject: [PATCH 13/19] remove nvtx macros --- cpp/src/io/json/nested_json.hpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 90b679265fb..10d209b2ea6 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -367,17 +367,3 @@ table_with_metadata host_parse_nested_json( } // namespace detail } // namespace cudf::io::json - -#define _CONCAT_(x, y) x##y -#define CONCAT(x, y) _CONCAT_(x, y) - -#define NVTX3_PUSH_RANGE_IN(D, tag) \ - ::nvtx3::registered_message const CONCAT(nvtx3_range_name__, __LINE__){std::string(tag)}; \ - ::nvtx3::event_attributes const CONCAT(nvtx3_range_attr__, \ - __LINE__){CONCAT(nvtx3_range_name__, __LINE__)}; \ - nvtxDomainRangePushEx(::nvtx3::domain::get(), CONCAT(nvtx3_range_attr__, __LINE__).get()); - -#define NVTX3_POP_RANGE(D) nvtxDomainRangePop(::nvtx3::domain::get()); - -#define CUDF_PUSH_RANGE(tag) NVTX3_PUSH_RANGE_IN(cudf::libcudf_domain, tag) -#define CUDF_POP_RANGE() NVTX3_POP_RANGE(cudf::libcudf_domain) From 5eefd64b2370b6f4b04f1af318f85f6ec247a3d4 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 17:47:17 +0530 Subject: [PATCH 14/19] NVTX RANGES macros commit --- cpp/src/io/json/nested_json.hpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 10d209b2ea6..34a071d3359 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -367,3 +367,27 @@ table_with_metadata host_parse_nested_json( } // namespace detail } // namespace cudf::io::json + +#define _CONCAT_(x, y) x##y +#define CONCAT(x, y) _CONCAT_(x, y) + +#define NVTX3_PUSH_RANGE_IN(D, tag) \ + ::nvtx3::registered_message const CONCAT(nvtx3_range_name__, __LINE__){std::string(tag)}; \ + ::nvtx3::event_attributes const CONCAT(nvtx3_range_attr__, \ + __LINE__){CONCAT(nvtx3_range_name__, __LINE__)}; \ + nvtxDomainRangePushEx(::nvtx3::domain::get(), CONCAT(nvtx3_range_attr__, __LINE__).get()); + +#define NVTX3_POP_RANGE(D) nvtxDomainRangePop(::nvtx3::domain::get()); + +#define CUDF_PUSH_RANGE(tag) NVTX3_PUSH_RANGE_IN(cudf::libcudf_domain, tag) +#define CUDF_POP_RANGE() NVTX3_POP_RANGE(cudf::libcudf_domain) + +#define NVTX3_SCOPED_RANGE_IN(D, tag) \ + ::nvtx3::registered_message const CONCAT(nvtx3_scope_name__, \ + __LINE__){std::string(__func__) + "::" + tag}; \ + ::nvtx3::event_attributes const CONCAT(nvtx3_scope_attr__, \ + __LINE__){CONCAT(nvtx3_scope_name__, __LINE__)}; \ + ::nvtx3::domain_thread_range const CONCAT(nvtx3_range__, \ + 
__LINE__){CONCAT(nvtx3_scope_attr__, __LINE__)}; + +#define CUDF_SCOPED_RANGE(tag) NVTX3_SCOPED_RANGE_IN(cudf::libcudf_domain, tag) From 3bb54f4dbafad77d48bef7f11e509c489c453227 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 17:48:09 +0530 Subject: [PATCH 15/19] Revert "NVTX RANGES macros commit" This reverts commit 5eefd64b2370b6f4b04f1af318f85f6ec247a3d4. --- cpp/src/io/json/nested_json.hpp | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 34a071d3359..10d209b2ea6 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -367,27 +367,3 @@ table_with_metadata host_parse_nested_json( } // namespace detail } // namespace cudf::io::json - -#define _CONCAT_(x, y) x##y -#define CONCAT(x, y) _CONCAT_(x, y) - -#define NVTX3_PUSH_RANGE_IN(D, tag) \ - ::nvtx3::registered_message const CONCAT(nvtx3_range_name__, __LINE__){std::string(tag)}; \ - ::nvtx3::event_attributes const CONCAT(nvtx3_range_attr__, \ - __LINE__){CONCAT(nvtx3_range_name__, __LINE__)}; \ - nvtxDomainRangePushEx(::nvtx3::domain::get(), CONCAT(nvtx3_range_attr__, __LINE__).get()); - -#define NVTX3_POP_RANGE(D) nvtxDomainRangePop(::nvtx3::domain::get()); - -#define CUDF_PUSH_RANGE(tag) NVTX3_PUSH_RANGE_IN(cudf::libcudf_domain, tag) -#define CUDF_POP_RANGE() NVTX3_POP_RANGE(cudf::libcudf_domain) - -#define NVTX3_SCOPED_RANGE_IN(D, tag) \ - ::nvtx3::registered_message const CONCAT(nvtx3_scope_name__, \ - __LINE__){std::string(__func__) + "::" + tag}; \ - ::nvtx3::event_attributes const CONCAT(nvtx3_scope_attr__, \ - __LINE__){CONCAT(nvtx3_scope_name__, __LINE__)}; \ - ::nvtx3::domain_thread_range const CONCAT(nvtx3_range__, \ - __LINE__){CONCAT(nvtx3_scope_attr__, __LINE__)}; - -#define CUDF_SCOPED_RANGE(tag) NVTX3_SCOPED_RANGE_IN(cudf::libcudf_domain, tag) From 5a0a9a77f231735d683c0284d0c2af13b1b65b27 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 11 Oct 2022 09:58:14 +0530 Subject: [PATCH 16/19] address review comments (upsj) --- cpp/src/io/json/json_tree.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 6a94ea05e81..f00abae9111 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -146,12 +146,12 @@ std::pair, rmm::device_uvector> stable_s CUDF_FUNC_RANGE(); // Determine temporary device storage requirements - rmm::device_uvector key1(key.size(), stream); - rmm::device_uvector key2(key.size(), stream); + rmm::device_uvector key1(key.size(), stream); + rmm::device_uvector key2(key.size(), stream); rmm::device_uvector order1(key.size(), stream); rmm::device_uvector order2(key.size(), stream); cub::DoubleBuffer order(order1.data(), order2.data()); - cub::DoubleBuffer key_buffer(key1.data(), key2.data()); + cub::DoubleBuffer key_buffer(key1.data(), key2.data()); size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, key_buffer, order, key.size()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); @@ -272,8 +272,9 @@ tree_meta_t get_tree_representation(device_span tokens, // previous push node_id // if previous node is a push, then i-1 - // if previous node is FE, then i-2 + // if previous node is FE, then i-2 (returns FB's index) // if previous node is SMB and its previous node is a push, then i-2 + // eg. `{ SMB FB FE VB VE SME` -> `{` index as FB's parent. 
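    // (Illustrative walk-through of the example above; token indices are assumed as
    //  0:`{` 1:SMB 2:FB 3:FE 4:VB 5:VE 6:SME.
    //  FB at i=2: previous token is SMB and the token before that is a push, so the
    //             parent token id is i-2 = 0, i.e. the `{`.
    //  VB at i=4: previous token is FE, so the parent token id is i-2 = 2, i.e. the
    //             field name FB.)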
// else -1 auto first_childs_parent_token_id = [tokens_gpu = tokens.begin()] __device__(auto i) -> NodeIndexT { From a356ea01caf5cee10b37acafc669b4d3cfa6e6d7 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 14 Oct 2022 08:16:00 +0530 Subject: [PATCH 17/19] Apply suggestions from code review Co-authored-by: Bradley Dice --- cpp/src/io/json/json_tree.cu | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index f00abae9111..1a9d3a01db8 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -129,29 +129,29 @@ struct node_ranges { }; /** - * @brief Returns stable sorted key and its sorted order + * @brief Returns stable sorted keys and its sorted order * * Uses cub stable radix sort * * @tparam IndexType sorted order type * @tparam KeyType key type - * @param key key to sort + * @param keys keys to sort * @param stream CUDA stream used for device memory operations and kernel launches. - * @return A pair of sorted key and its sorted order + * @return Sorted keys and indices producing that sorted order */ template std::pair, rmm::device_uvector> stable_sorted_key_order( - cudf::device_span key, rmm::cuda_stream_view stream) + cudf::device_span keys, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); // Determine temporary device storage requirements - rmm::device_uvector key1(key.size(), stream); - rmm::device_uvector key2(key.size(), stream); - rmm::device_uvector order1(key.size(), stream); - rmm::device_uvector order2(key.size(), stream); - cub::DoubleBuffer order(order1.data(), order2.data()); - cub::DoubleBuffer key_buffer(key1.data(), key2.data()); + rmm::device_uvector keys_buffer1(key.size(), stream); + rmm::device_uvector keys_buffer2(key.size(), stream); + rmm::device_uvector order_buffer1(key.size(), stream); + rmm::device_uvector order_buffer2(key.size(), stream); + cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); + cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, key_buffer, order, key.size()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); @@ -253,7 +253,7 @@ tree_meta_t get_tree_representation(device_span tokens, tokens.begin(), node_levels.begin(), is_node); - CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); + CUDF_EXPECTS(thrust::distance(node_levels.begin(), node_levels_end) == num_nodes, "node level count mismatch"); } // Node parent ids: From 7116570b1de8fa9c4e46d4945ccc67b0d2726edc Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 14 Oct 2022 08:22:43 +0530 Subject: [PATCH 18/19] address review comments --- cpp/src/io/json/json_tree.cu | 44 ++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 1a9d3a01db8..3d3fad41d76 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -146,24 +146,27 @@ std::pair, rmm::device_uvector> stable_s CUDF_FUNC_RANGE(); // Determine temporary device storage requirements - rmm::device_uvector keys_buffer1(key.size(), stream); - rmm::device_uvector keys_buffer2(key.size(), stream); - rmm::device_uvector order_buffer1(key.size(), stream); - rmm::device_uvector order_buffer2(key.size(), stream); + rmm::device_uvector 
keys_buffer1(keys.size(), stream); + rmm::device_uvector keys_buffer2(keys.size(), stream); + rmm::device_uvector order_buffer1(keys.size(), stream); + rmm::device_uvector order_buffer2(keys.size(), stream); cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, key_buffer, order, key.size()); + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - thrust::copy(rmm::exec_policy(stream), key.begin(), key.end(), key1.begin()); - thrust::sequence(rmm::exec_policy(stream), order1.begin(), order1.end()); + thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin()); + thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); cub::DeviceRadixSort::SortPairs( - d_temp_storage.data(), temp_storage_bytes, key_buffer, order, key.size()); + d_temp_storage.data(), temp_storage_bytes, keys_buffer, order_buffer, keys.size()); - return std::pair{key_buffer.Current() == key1.data() ? std::move(key1) : std::move(key2), - order.Current() == order1.data() ? std::move(order1) : std::move(order2)}; + return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) + : std::move(keys_buffer2), + order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) + : std::move(order_buffer2)}; } /** @@ -253,14 +256,14 @@ tree_meta_t get_tree_representation(device_span tokens, tokens.begin(), node_levels.begin(), is_node); - CUDF_EXPECTS(thrust::distance(node_levels.begin(), node_levels_end) == num_nodes, "node level count mismatch"); + CUDF_EXPECTS(thrust::distance(node_levels.begin(), node_levels_end) == num_nodes, + "node level count mismatch"); } // Node parent ids: // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. - // This algorithms is a more generalized logical stack. - // TODO: make it own function. rmm::device_uvector parent_node_ids(num_nodes, stream, mr); + // This block of code is generalized logical stack algorithm. TODO: make this a seperate function. 
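  // Outline of the steps below:
  //   1. copy_if collects the token index of every node (node_token_ids).
  //   2. transform records, for each node, its parent's token index only if the node is the
  //      first child (first_childs_parent_token_id), else -1, and maps that token index to a
  //      node id with lower_bound over node_token_ids.
  //   3. propagate_parent_to_siblings stable-sorts nodes by level and runs a max scan per
  //      level so later siblings inherit the parent id written at the first sibling.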
{ rmm::device_uvector node_token_ids(num_nodes, stream); thrust::copy_if(rmm::exec_policy(stream), @@ -278,17 +281,18 @@ tree_meta_t get_tree_representation(device_span tokens, // else -1 auto first_childs_parent_token_id = [tokens_gpu = tokens.begin()] __device__(auto i) -> NodeIndexT { - if (i <= 0) return -1; - if (tokens_gpu[i - 1] == token_t::StructBegin || tokens_gpu[i - 1] == token_t::ListBegin) + if (i <= 0) { return -1; } + if (tokens_gpu[i - 1] == token_t::StructBegin or tokens_gpu[i - 1] == token_t::ListBegin) { return i - 1; - else if (tokens_gpu[i - 1] == token_t::FieldNameEnd) + } else if (tokens_gpu[i - 1] == token_t::FieldNameEnd) { return i - 2; - else if (tokens_gpu[i - 1] == token_t::StructMemberBegin && - (tokens_gpu[i - 2] == token_t::StructBegin || - tokens_gpu[i - 2] == token_t::ListBegin)) + } else if (tokens_gpu[i - 1] == token_t::StructMemberBegin and + (tokens_gpu[i - 2] == token_t::StructBegin || + tokens_gpu[i - 2] == token_t::ListBegin)) { return i - 2; - else + } else { return -1; + } }; thrust::transform( From 8e0c85ff5c9a9ada87bb21d9327be88081fcf2f8 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 14 Oct 2022 08:53:39 +0530 Subject: [PATCH 19/19] add copy, memory savings comments for stable_sorted_key_order --- cpp/src/io/json/json_tree.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 3d3fad41d76..cf041b02a20 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -131,7 +131,9 @@ struct node_ranges { /** * @brief Returns stable sorted keys and its sorted order * - * Uses cub stable radix sort + * Uses cub stable radix sort. The order is internally generated, hence it saves a copy and memory. + * Since the key and order is returned, using double buffer helps to avoid extra copy to user + * provided output iterator. * * @tparam IndexType sorted order type * @tparam KeyType key type
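For context, a minimal host-only sketch (not part of the patch series) of what the level-keyed max scan in propagate_parent_to_siblings does; the levels and parent ids below are made up for illustration:

#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>

#include <cstdio>

int main()
{
  // Nodes already stable-sorted by level; only the first child of each parent carries the
  // real parent node id, the remaining siblings hold -1 (the sentinel).
  thrust::host_vector<int> sorted_levels{1, 1, 1, 2, 2, 2};
  thrust::host_vector<int> parent_ids{0, -1, -1, 2, -1, -1};

  // Segmented scan keyed by level with maximum: within a level, later first children have
  // larger parent ids, so max(-1, parent_id) propagates each parent forward to its siblings.
  thrust::inclusive_scan_by_key(sorted_levels.begin(),
                                sorted_levels.end(),
                                parent_ids.begin(),
                                parent_ids.begin(),
                                thrust::equal_to<int>{},
                                thrust::maximum<int>{});

  for (int p : parent_ids) std::printf("%d ", p);  // prints: 0 0 0 2 2 2
  std::printf("\n");
  return 0;
}

In the patch itself the same scan runs on the device, reading and writing parent_node_ids through a permutation iterator over sorted_order, so the parent ids are updated in place without a separate gather step.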