From a75b0a5f13300f62fa7cf514651f105bd988d051 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 30 Sep 2022 13:08:16 +0530 Subject: [PATCH 01/19] fix the right condition for parent_node propagation initial condition --- cpp/src/io/json/json_tree.cu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index dbf026c351e..19fb8f2c980 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -235,7 +235,12 @@ tree_meta_t get_tree_representation(device_span tokens, parent_token_ids.begin(), parent_token_ids.end(), [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { - return (i > 0) && does_push(tokens_gpu[i - 1]) ? i - 1 : -1; + return (i > 0) + ? ((tokens_gpu[i - 1] == token_t::StructBegin || + tokens_gpu[i - 1] == token_t::ListBegin) + ? i - 1 + : (tokens_gpu[i - 1] == token_t::FieldNameEnd ? i - 2 : -1)) + : -1; // -1, not sentinel used here because of max operation below }); From 4abfb5147c3e065d2d06560e94bdad725f823d6d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 3 Oct 2022 22:11:59 +0530 Subject: [PATCH 02/19] parent_node_id generation using only nodes instead of tokens reduces memory usage by 35% (1GB json takes 10.951GB instead of 16.957GB) --- cpp/src/io/json/json_tree.cu | 167 ++++++++++++++++++++++++++--------- 1 file changed, 124 insertions(+), 43 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 19fb8f2c980..97320c70c30 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -53,6 +53,46 @@ namespace cudf::io::json { namespace detail { +// DEBUG print +[[maybe_unused]] auto to_token_str = [](PdaTokenT token) -> std::string { + switch (token) { + case token_t::StructBegin: return " {"; + case token_t::StructEnd: return " }"; + case token_t::ListBegin: return " ["; + case token_t::ListEnd: return " ]"; + case token_t::FieldNameBegin: return "FB"; + case token_t::FieldNameEnd: return "FE"; + case token_t::StringBegin: return "SB"; + case token_t::StringEnd: return "SE"; + case token_t::ErrorBegin: return "er"; + case token_t::ValueBegin: return "VB"; + case token_t::ValueEnd: return "VE"; + case token_t::StructMemberBegin: return " <"; + case token_t::StructMemberEnd: return " >"; + default: return "."; + } +}; +// DEBUG prints +auto to_cat = [](auto v) -> std::string { + switch (v) { + case NC_STRUCT: return " S"; + case NC_LIST: return " L"; + case NC_STR: return " \""; + case NC_VAL: return " V"; + case NC_FN: return " F"; + case NC_ERR: return "ER"; + default: return "UN"; + }; +}; +auto to_int = [](auto v) { return std::to_string(static_cast(v)); }; +auto print_vec = [](auto const& gpu, auto const name, auto converter) { + return; + auto cpu = cudf::detail::make_std_vector_sync(gpu, cudf::default_stream_value); + for (auto const& v : cpu) + printf("%3s,", converter(v).c_str()); + std::cout << name << std::endl; +}; + // The node that a token represents struct token_to_node { __device__ auto operator()(PdaTokenT const token) -> NodeT @@ -165,13 +205,16 @@ tree_meta_t get_tree_representation(device_span tokens, }; }; + CUDF_PUSH_RANGE("num_nodes"); auto num_tokens = tokens.size(); auto is_node_it = thrust::make_transform_iterator( tokens.begin(), [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); auto num_nodes = thrust::count_if( rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); + CUDF_POP_RANGE(); + 
CUDF_PUSH_RANGE("node_categories"); // Node categories: copy_if with transform. rmm::device_uvector node_categories(num_nodes, stream, mr); auto node_categories_it = @@ -183,7 +226,9 @@ tree_meta_t get_tree_representation(device_span tokens, is_node); CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, "node category count mismatch"); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("token_levels"); // Node levels: transform_exclusive_scan, copy_if. rmm::device_uvector token_levels(num_tokens, stream); auto push_pop_it = thrust::make_transform_iterator( @@ -192,7 +237,9 @@ tree_meta_t get_tree_representation(device_span tokens, }); thrust::exclusive_scan( rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("node_levels"); rmm::device_uvector node_levels(num_nodes, stream, mr); auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), token_levels.begin(), @@ -201,7 +248,9 @@ tree_meta_t get_tree_representation(device_span tokens, node_levels.begin(), is_node); CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("node_ranges"); // Node ranges: copy_if with transform. rmm::device_uvector node_range_begin(num_nodes, stream, mr); rmm::device_uvector node_range_end(num_nodes, stream, mr); @@ -222,77 +271,109 @@ tree_meta_t get_tree_representation(device_span tokens, return is_node(tokens_gpu[i]); }); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("parent_token_ids"); // Node parent ids: previous push token_id transform, stable sort, segmented scan with Max, // reorder, copy_if. This one is sort of logical stack. But more generalized. // TODO: make it own function. - rmm::device_uvector parent_token_ids(num_tokens, stream); - rmm::device_uvector initial_order(num_tokens, stream); - // TODO re-write the algorithm to work only on nodes, not tokens. - - thrust::sequence(rmm::exec_policy(stream), initial_order.begin(), initial_order.end()); - thrust::tabulate(rmm::exec_policy(stream), - parent_token_ids.begin(), - parent_token_ids.end(), - [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { - return (i > 0) - ? ((tokens_gpu[i - 1] == token_t::StructBegin || - tokens_gpu[i - 1] == token_t::ListBegin) - ? i - 1 - : (tokens_gpu[i - 1] == token_t::FieldNameEnd ? i - 2 : -1)) - : -1; - // -1, not sentinel used here because of max operation below - }); - auto out_pid = thrust::make_zip_iterator(parent_token_ids.data(), initial_order.data()); + // TODO re-write the algorithm to work only on nodes, not tokens. + // // ### only push nodes matter for scan! (verify throughly if true. For i-1 == FE, then i-3 + // matters. or i-2 because FB and SMB treated same. make sure nodeid matches right for FB/SMB. + // // now copy only push operations to seperate array with token_levels, & node_id. + // // sort by level, then scan it. then scatter to node_id positions. + // // then another scan for non-push nodes? or another scan before scatter? L/S SMB FE. + // // total memory: num_nodes*(4b+b+4b) <= 9b*num_nodes. + rmm::device_uvector parent_token_ids2(num_nodes, stream); + auto prev_parent_node_it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { + return (i > 0) ? 
((tokens_gpu[i - 1] == token_t::StructBegin || + tokens_gpu[i - 1] == token_t::ListBegin) + ? i - 1 + : ((tokens_gpu[i - 1] == token_t::FieldNameEnd || + (tokens_gpu[i - 1] == token_t::StructMemberBegin && + (tokens_gpu[i - 2] == token_t::StructBegin || + tokens_gpu[i - 2] == token_t::ListBegin))) + ? i - 2 + : -1)) + : -1; + // -1, not sentinel used here because of max operation below + }); + thrust::copy_if(rmm::exec_policy(stream), + prev_parent_node_it, + prev_parent_node_it + num_tokens, + tokens.begin(), + parent_token_ids2.begin(), + is_node); + rmm::device_uvector initial_order2(num_nodes, stream); + thrust::sequence(rmm::exec_policy(stream), initial_order2.begin(), initial_order2.end()); + auto out_pid2 = thrust::make_zip_iterator(parent_token_ids2.data(), initial_order2.data()); // Uses radix sort for builtin types. thrust::stable_sort_by_key(rmm::exec_policy(stream), - token_levels.data(), - token_levels.data() + token_levels.size(), - out_pid); + node_levels.data(), + node_levels.data() + node_levels.size(), + out_pid2); - // SegmentedScan Max. thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - token_levels.data(), - token_levels.data() + token_levels.size(), - parent_token_ids.data(), - parent_token_ids.data(), + node_levels.data(), + node_levels.data() + node_levels.size(), + parent_token_ids2.data(), + parent_token_ids2.data(), thrust::equal_to{}, thrust::maximum{}); + // thrust::sort_by_key(rmm::exec_policy(stream), + // initial_order2.data(), + // initial_order2.data() + initial_order2.size(), + // thrust::make_zip_iterator(parent_token_ids2.data(), + // node_levels.data())); // scatter to restore the original order. { - rmm::device_uvector temp_storage(num_tokens, stream); + CUDF_PUSH_RANGE("scatter"); + rmm::device_uvector temp_storage(num_nodes, stream); thrust::scatter(rmm::exec_policy(stream), - parent_token_ids.begin(), - parent_token_ids.end(), - initial_order.begin(), + parent_token_ids2.begin(), + parent_token_ids2.end(), + initial_order2.begin(), temp_storage.begin()); + thrust::copy(rmm::exec_policy(stream), + temp_storage.begin(), + temp_storage.end(), + parent_token_ids2.begin()); + rmm::device_uvector temp_storage2(num_nodes, stream); + thrust::scatter(rmm::exec_policy(stream), + node_levels.begin(), + node_levels.end(), + initial_order2.begin(), + temp_storage2.begin()); thrust::copy( - rmm::exec_policy(stream), temp_storage.begin(), temp_storage.end(), parent_token_ids.begin()); + rmm::exec_policy(stream), temp_storage2.begin(), temp_storage2.end(), node_levels.begin()); + CUDF_POP_RANGE(); } + CUDF_POP_RANGE(); // parent_token_ids + CUDF_PUSH_RANGE("node_ids"); + // use copy_if counting_it and do lower_bound. which is faster? rmm::device_uvector node_ids_gpu(num_tokens, stream); thrust::exclusive_scan( rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); + CUDF_POP_RANGE(); - rmm::device_uvector parent_node_ids(num_nodes, stream, mr); - auto parent_node_ids_it = thrust::make_transform_iterator( - parent_token_ids.begin(), + CUDF_PUSH_RANGE("parent_node_ids"); + thrust::transform( + rmm::exec_policy(stream), + parent_token_ids2.begin(), + parent_token_ids2.end(), + parent_token_ids2.begin(), [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { return pid < 0 ? 
parent_node_sentinel : node_ids_gpu[pid]; }); - auto parent_node_ids_end = thrust::copy_if(rmm::exec_policy(stream), - parent_node_ids_it, - parent_node_ids_it + parent_token_ids.size(), - tokens.begin(), - parent_node_ids.begin(), - is_node); - CUDF_EXPECTS(parent_node_ids_end - parent_node_ids.begin() == num_nodes, - "parent node id gather mismatch"); + CUDF_POP_RANGE(); return {std::move(node_categories), - std::move(parent_node_ids), + std::move(parent_token_ids2), std::move(node_levels), std::move(node_range_begin), std::move(node_range_end)}; From efb662193186b03f4fe553b8afb27b8a6ce4cd2a Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 6 Oct 2022 17:25:55 +0530 Subject: [PATCH 03/19] reduce node_ids memory (not impacting peak memory) --- cpp/src/io/json/json_tree.cu | 72 ++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 97320c70c30..6a5fc3177a9 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -86,7 +86,6 @@ auto to_cat = [](auto v) -> std::string { }; auto to_int = [](auto v) { return std::to_string(static_cast(v)); }; auto print_vec = [](auto const& gpu, auto const name, auto converter) { - return; auto cpu = cudf::detail::make_std_vector_sync(gpu, cudf::default_stream_value); for (auto const& v : cpu) printf("%3s,", converter(v).c_str()); @@ -207,9 +206,9 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_PUSH_RANGE("num_nodes"); auto num_tokens = tokens.size(); - auto is_node_it = thrust::make_transform_iterator( - tokens.begin(), - [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); + // auto is_node_it = thrust::make_transform_iterator( + // tokens.begin(), + // [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); auto num_nodes = thrust::count_if( rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); CUDF_POP_RANGE(); @@ -301,21 +300,30 @@ tree_meta_t get_tree_representation(device_span tokens, : -1; // -1, not sentinel used here because of max operation below }); + CUDF_PUSH_RANGE("copy_if"); thrust::copy_if(rmm::exec_policy(stream), prev_parent_node_it, prev_parent_node_it + num_tokens, tokens.begin(), parent_token_ids2.begin(), is_node); + CUDF_POP_RANGE(); + + CUDF_PUSH_RANGE("seq"); rmm::device_uvector initial_order2(num_nodes, stream); thrust::sequence(rmm::exec_policy(stream), initial_order2.begin(), initial_order2.end()); + CUDF_POP_RANGE(); + + CUDF_PUSH_RANGE("stable_sort"); auto out_pid2 = thrust::make_zip_iterator(parent_token_ids2.data(), initial_order2.data()); // Uses radix sort for builtin types. thrust::stable_sort_by_key(rmm::exec_policy(stream), node_levels.data(), node_levels.data() + node_levels.size(), out_pid2); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("scan"); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), node_levels.data(), node_levels.data() + node_levels.size(), @@ -323,6 +331,7 @@ tree_meta_t get_tree_representation(device_span tokens, parent_token_ids2.data(), thrust::equal_to{}, thrust::maximum{}); + CUDF_POP_RANGE(); // thrust::sort_by_key(rmm::exec_policy(stream), // initial_order2.data(), // initial_order2.data() + initial_order2.size(), @@ -356,20 +365,53 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_PUSH_RANGE("node_ids"); // use copy_if counting_it and do lower_bound. which is faster? 
- rmm::device_uvector node_ids_gpu(num_tokens, stream); - thrust::exclusive_scan( - rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); + // rmm::device_uvector node_ids_gpu(num_tokens, stream); + // thrust::exclusive_scan( + // rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); + + rmm::device_uvector node_ids_gpu3(num_nodes, + stream); // TODO reuse initial_order memory. + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, + tokens.begin(), + node_ids_gpu3.begin(), + is_node); + CUDF_POP_RANGE(); + // rmm::device_uvector node_ids_gpu2(num_tokens, stream); + // thrust::sequence(rmm::exec_policy(stream), node_ids_gpu2.begin(), node_ids_gpu2.end()); + // print_vec(node_ids_gpu2, "token_id_gpu", to_int); + // print_vec(tokens, "tokens", to_token_str); + // print_vec(node_ids_gpu, "node_ids_gpu", to_int); + // print_vec(parent_token_ids2, "parent_token_ids2", to_int); + // print_vec(node_categories, "node_categories", to_cat); + // print_vec(node_ids_gpu3, "node_ids_gpu3", to_int); CUDF_PUSH_RANGE("parent_node_ids"); - thrust::transform( - rmm::exec_policy(stream), - parent_token_ids2.begin(), - parent_token_ids2.end(), - parent_token_ids2.begin(), - [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { - return pid < 0 ? parent_node_sentinel : node_ids_gpu[pid]; - }); + // rmm::device_uvector parent_node_ids2(num_nodes, stream); + thrust::transform(rmm::exec_policy(stream), + parent_token_ids2.begin(), + parent_token_ids2.end(), + parent_token_ids2.begin(), + [node_ids_gpu = node_ids_gpu3.begin(), + num_nodes] __device__(size_type const pid) -> NodeIndexT { + return pid < 0 ? parent_node_sentinel + : thrust::lower_bound( + thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + }); + // print_vec(parent_node_ids2, "parent_node_ids2", to_int); + + // thrust::transform( + // rmm::exec_policy(stream), + // parent_token_ids2.begin(), + // parent_token_ids2.end(), + // parent_token_ids2.begin(), + // [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { + // return pid < 0 ? parent_node_sentinel : node_ids_gpu[pid]; + // }); + // print_vec(parent_token_ids2, "parent_token_ids2", to_int); CUDF_POP_RANGE(); return {std::move(node_categories), From 5f250cb1bd7c07ba3d668dfea6e256b4090f963b Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 6 Oct 2022 17:38:01 +0530 Subject: [PATCH 04/19] reorder node_range, node_cat, scope limit token_levels reduce peak memory usage (not total memory used) reorder node_range, node_cat, scope limit token_levels 10.957 GiB -> 9.91 GiB -> 9.774 GiB -> 9.403 GiB --- cpp/src/io/json/json_tree.cu | 110 ++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 6a5fc3177a9..8c980de8774 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -213,63 +213,28 @@ tree_meta_t get_tree_representation(device_span tokens, rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("node_categories"); - // Node categories: copy_if with transform. 
- rmm::device_uvector node_categories(num_nodes, stream, mr); - auto node_categories_it = - thrust::make_transform_output_iterator(node_categories.begin(), token_to_node{}); - auto node_categories_end = thrust::copy_if(rmm::exec_policy(stream), - tokens.begin(), - tokens.begin() + num_tokens, - node_categories_it, - is_node); - CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, - "node category count mismatch"); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("token_levels"); - // Node levels: transform_exclusive_scan, copy_if. - rmm::device_uvector token_levels(num_tokens, stream); - auto push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { - return does_push(token) - does_pop(token); - }); - thrust::exclusive_scan( - rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); - CUDF_POP_RANGE(); - - CUDF_PUSH_RANGE("node_levels"); rmm::device_uvector node_levels(num_nodes, stream, mr); - auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), - token_levels.begin(), - token_levels.begin() + num_tokens, - tokens.begin(), - node_levels.begin(), - is_node); - CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); - CUDF_POP_RANGE(); - - CUDF_PUSH_RANGE("node_ranges"); - // Node ranges: copy_if with transform. - rmm::device_uvector node_range_begin(num_nodes, stream, mr); - rmm::device_uvector node_range_end(num_nodes, stream, mr); - auto node_range_tuple_it = - thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); - // Whether the tokenizer stage should keep quote characters for string values - // If the tokenizer keeps the quote characters, they may be stripped during type casting - constexpr bool include_quote_char = true; - auto node_range_out_it = thrust::make_transform_output_iterator( - node_range_tuple_it, node_ranges{tokens, token_indices, include_quote_char}); + { + // Node levels: transform_exclusive_scan, copy_if. + rmm::device_uvector token_levels(num_tokens, stream); + auto push_pop_it = thrust::make_transform_iterator( + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { + return does_push(token) - does_pop(token); + }); + thrust::exclusive_scan( + rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); + CUDF_POP_RANGE(); - auto node_range_out_end = - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + num_tokens, - node_range_out_it, - [is_node, tokens_gpu = tokens.begin()] __device__(size_type i) -> bool { - return is_node(tokens_gpu[i]); - }); - CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); + CUDF_PUSH_RANGE("node_levels"); + auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), + token_levels.begin(), + token_levels.begin() + num_tokens, + tokens.begin(), + node_levels.begin(), + is_node); + CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); + } CUDF_POP_RANGE(); CUDF_PUSH_RANGE("parent_token_ids"); @@ -414,6 +379,43 @@ tree_meta_t get_tree_representation(device_span tokens, // print_vec(parent_token_ids2, "parent_token_ids2", to_int); CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("node_categories"); + // Node categories: copy_if with transform. 
+ rmm::device_uvector node_categories(num_nodes, stream, mr); + auto node_categories_it = + thrust::make_transform_output_iterator(node_categories.begin(), token_to_node{}); + auto node_categories_end = thrust::copy_if(rmm::exec_policy(stream), + tokens.begin(), + tokens.begin() + num_tokens, + node_categories_it, + is_node); + CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, + "node category count mismatch"); + CUDF_POP_RANGE(); + + CUDF_PUSH_RANGE("node_ranges"); + // Node ranges: copy_if with transform. + rmm::device_uvector node_range_begin(num_nodes, stream, mr); + rmm::device_uvector node_range_end(num_nodes, stream, mr); + auto node_range_tuple_it = + thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); + // Whether the tokenizer stage should keep quote characters for string values + // If the tokenizer keeps the quote characters, they may be stripped during type casting + constexpr bool include_quote_char = true; + auto node_range_out_it = thrust::make_transform_output_iterator( + node_range_tuple_it, node_ranges{tokens, token_indices, include_quote_char}); + + auto node_range_out_end = + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, + node_range_out_it, + [is_node, tokens_gpu = tokens.begin()] __device__(size_type i) -> bool { + return is_node(tokens_gpu[i]); + }); + CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); + CUDF_POP_RANGE(); + return {std::move(node_categories), std::move(parent_token_ids2), std::move(node_levels), From 49cb0d7ee7478e233c5eb64dc53087242bb53242 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 00:51:15 +0530 Subject: [PATCH 05/19] use cub SortPairs to reduce memory 9.403 GiB to 8.487 GiB (for 1GB json input) --- cpp/src/io/json/json_tree.cu | 108 ++++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 47 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 8c980de8774..78914264020 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -29,6 +29,8 @@ #include +#include + #include #include #include @@ -274,58 +276,70 @@ tree_meta_t get_tree_representation(device_span tokens, is_node); CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("seq"); - rmm::device_uvector initial_order2(num_nodes, stream); - thrust::sequence(rmm::exec_policy(stream), initial_order2.begin(), initial_order2.end()); - CUDF_POP_RANGE(); - - CUDF_PUSH_RANGE("stable_sort"); - auto out_pid2 = thrust::make_zip_iterator(parent_token_ids2.data(), initial_order2.data()); - // Uses radix sort for builtin types. - thrust::stable_sort_by_key(rmm::exec_policy(stream), - node_levels.data(), - node_levels.data() + node_levels.size(), - out_pid2); - CUDF_POP_RANGE(); - - CUDF_PUSH_RANGE("scan"); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - node_levels.data(), - node_levels.data() + node_levels.size(), - parent_token_ids2.data(), - parent_token_ids2.data(), - thrust::equal_to{}, - thrust::maximum{}); - CUDF_POP_RANGE(); - // thrust::sort_by_key(rmm::exec_policy(stream), - // initial_order2.data(), - // initial_order2.data() + initial_order2.size(), - // thrust::make_zip_iterator(parent_token_ids2.data(), - // node_levels.data())); - - // scatter to restore the original order. { + auto [node_levels1, initial_order1] = [&]() { + // Uses stable radix sort for builtin types. 
+ CUDF_PUSH_RANGE("cub-sort"); + CUDF_PUSH_RANGE("seq"); + rmm::device_uvector initial_order2(num_nodes, stream); + thrust::sequence(rmm::exec_policy(stream), initial_order2.begin(), initial_order2.end()); + CUDF_POP_RANGE(); + + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + CUDF_PUSH_RANGE("cub-out"); + rmm::device_uvector node_levels1(num_nodes, stream, mr); + rmm::device_uvector initial_order1(num_nodes, stream); + cub::DeviceRadixSort::SortPairs(nullptr, + temp_storage_bytes, + node_levels.data(), + node_levels1.data(), + initial_order2.data(), + initial_order1.data(), + num_nodes); + CUDF_PUSH_RANGE("cub-temp"); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), + temp_storage_bytes, + node_levels.data(), + node_levels1.data(), + initial_order2.data(), + initial_order1.data(), + num_nodes); + CUDF_POP_RANGE(); + CUDF_POP_RANGE(); + CUDF_POP_RANGE(); + return std::pair{std::move(node_levels1), std::move(initial_order1)}; + }(); + // gather. additional memory!!!? + CUDF_PUSH_RANGE("scan"); + rmm::device_uvector parent_token_ids1(num_nodes, stream); + thrust::gather(rmm::exec_policy(stream), + initial_order1.begin(), + initial_order1.end(), + parent_token_ids2.begin(), + parent_token_ids1.begin()); + CUDF_POP_RANGE(); + + CUDF_PUSH_RANGE("scan"); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + node_levels1.data(), + node_levels1.data() + num_nodes, + parent_token_ids1.data(), + parent_token_ids1.data(), + thrust::equal_to{}, + thrust::maximum{}); + CUDF_POP_RANGE(); CUDF_PUSH_RANGE("scatter"); - rmm::device_uvector temp_storage(num_nodes, stream); + // scatter to restore the original order. thrust::scatter(rmm::exec_policy(stream), - parent_token_ids2.begin(), - parent_token_ids2.end(), - initial_order2.begin(), - temp_storage.begin()); - thrust::copy(rmm::exec_policy(stream), - temp_storage.begin(), - temp_storage.end(), - parent_token_ids2.begin()); - rmm::device_uvector temp_storage2(num_nodes, stream); - thrust::scatter(rmm::exec_policy(stream), - node_levels.begin(), - node_levels.end(), - initial_order2.begin(), - temp_storage2.begin()); - thrust::copy( - rmm::exec_policy(stream), temp_storage2.begin(), temp_storage2.end(), node_levels.begin()); + parent_token_ids1.begin(), + parent_token_ids1.end(), + initial_order1.begin(), + parent_token_ids2.begin()); CUDF_POP_RANGE(); } + CUDF_POP_RANGE(); // parent_token_ids CUDF_PUSH_RANGE("node_ids"); From 02a7b5bbc6d6dd4a5fb102d6bcb36972ec0e45b1 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 02:21:11 +0530 Subject: [PATCH 06/19] reduce memory by cub::DoubleBuffer, scope limit token_id_for_nodes --- cpp/src/io/json/json_tree.cu | 133 ++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 66 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 78914264020..ff07380be37 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -288,31 +288,28 @@ tree_meta_t get_tree_representation(device_span tokens, // Determine temporary device storage requirements size_t temp_storage_bytes = 0; CUDF_PUSH_RANGE("cub-out"); - rmm::device_uvector node_levels1(num_nodes, stream, mr); + rmm::device_uvector node_levels2(node_levels, stream); + rmm::device_uvector node_levels1(num_nodes, stream); rmm::device_uvector initial_order1(num_nodes, stream); - cub::DeviceRadixSort::SortPairs(nullptr, - temp_storage_bytes, - node_levels.data(), - 
node_levels1.data(), - initial_order2.data(), - initial_order1.data(), - num_nodes); + cub::DoubleBuffer order(initial_order2.data(), initial_order1.data()); + cub::DoubleBuffer levels(node_levels2.data(), node_levels1.data()); + cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, levels, order, num_nodes); CUDF_PUSH_RANGE("cub-temp"); + // std::cout<<"temp_storage_bytes: "< parent_token_ids1(num_nodes, stream); thrust::gather(rmm::exec_policy(stream), initial_order1.begin(), @@ -321,7 +318,7 @@ tree_meta_t get_tree_representation(device_span tokens, parent_token_ids1.begin()); CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("scan"); + CUDF_PUSH_RANGE("inscan"); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), node_levels1.data(), node_levels1.data() + num_nodes, @@ -338,60 +335,64 @@ tree_meta_t get_tree_representation(device_span tokens, initial_order1.begin(), parent_token_ids2.begin()); CUDF_POP_RANGE(); + CUDF_POP_RANGE(); } CUDF_POP_RANGE(); // parent_token_ids - CUDF_PUSH_RANGE("node_ids"); - // use copy_if counting_it and do lower_bound. which is faster? - // rmm::device_uvector node_ids_gpu(num_tokens, stream); - // thrust::exclusive_scan( - // rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); - - rmm::device_uvector node_ids_gpu3(num_nodes, - stream); // TODO reuse initial_order memory. - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + num_tokens, - tokens.begin(), - node_ids_gpu3.begin(), - is_node); + { + CUDF_PUSH_RANGE("node_ids"); + // use copy_if counting_it and do lower_bound. which is faster? + // rmm::device_uvector node_ids_gpu(num_tokens, stream); + // thrust::exclusive_scan( + // rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); + + rmm::device_uvector node_ids_gpu3(num_nodes, + stream); // TODO reuse initial_order memory. + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, + tokens.begin(), + node_ids_gpu3.begin(), + is_node); - CUDF_POP_RANGE(); - // rmm::device_uvector node_ids_gpu2(num_tokens, stream); - // thrust::sequence(rmm::exec_policy(stream), node_ids_gpu2.begin(), node_ids_gpu2.end()); - // print_vec(node_ids_gpu2, "token_id_gpu", to_int); - // print_vec(tokens, "tokens", to_token_str); - // print_vec(node_ids_gpu, "node_ids_gpu", to_int); - // print_vec(parent_token_ids2, "parent_token_ids2", to_int); - // print_vec(node_categories, "node_categories", to_cat); - // print_vec(node_ids_gpu3, "node_ids_gpu3", to_int); - - CUDF_PUSH_RANGE("parent_node_ids"); - // rmm::device_uvector parent_node_ids2(num_nodes, stream); - thrust::transform(rmm::exec_policy(stream), - parent_token_ids2.begin(), - parent_token_ids2.end(), - parent_token_ids2.begin(), - [node_ids_gpu = node_ids_gpu3.begin(), - num_nodes] __device__(size_type const pid) -> NodeIndexT { - return pid < 0 ? parent_node_sentinel - : thrust::lower_bound( - thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - - node_ids_gpu; - }); - // print_vec(parent_node_ids2, "parent_node_ids2", to_int); - - // thrust::transform( - // rmm::exec_policy(stream), - // parent_token_ids2.begin(), - // parent_token_ids2.end(), - // parent_token_ids2.begin(), - // [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { - // return pid < 0 ? 
parent_node_sentinel : node_ids_gpu[pid]; - // }); - // print_vec(parent_token_ids2, "parent_token_ids2", to_int); - CUDF_POP_RANGE(); + CUDF_POP_RANGE(); + // rmm::device_uvector node_ids_gpu2(num_tokens, stream); + // thrust::sequence(rmm::exec_policy(stream), node_ids_gpu2.begin(), node_ids_gpu2.end()); + // print_vec(node_ids_gpu2, "token_id_gpu", to_int); + // print_vec(tokens, "tokens", to_token_str); + // print_vec(node_ids_gpu, "node_ids_gpu", to_int); + // print_vec(parent_token_ids2, "parent_token_ids2", to_int); + // print_vec(node_categories, "node_categories", to_cat); + // print_vec(node_ids_gpu3, "node_ids_gpu3", to_int); + + CUDF_PUSH_RANGE("parent_node_ids"); + // rmm::device_uvector parent_node_ids2(num_nodes, stream); + thrust::transform(rmm::exec_policy(stream), + parent_token_ids2.begin(), + parent_token_ids2.end(), + parent_token_ids2.begin(), + [node_ids_gpu = node_ids_gpu3.begin(), + num_nodes] __device__(size_type const pid) -> NodeIndexT { + return pid < 0 + ? parent_node_sentinel + : thrust::lower_bound( + thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + }); + // print_vec(parent_node_ids2, "parent_node_ids2", to_int); + + // thrust::transform( + // rmm::exec_policy(stream), + // parent_token_ids2.begin(), + // parent_token_ids2.end(), + // parent_token_ids2.begin(), + // [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { + // return pid < 0 ? parent_node_sentinel : node_ids_gpu[pid]; + // }); + // print_vec(parent_token_ids2, "parent_token_ids2", to_int); + CUDF_POP_RANGE(); + } CUDF_PUSH_RANGE("node_categories"); // Node categories: copy_if with transform. From 9243d893b52fdda12443ef24c63f487e5d4aad96 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 14:06:08 +0530 Subject: [PATCH 07/19] cleanup --- cpp/src/io/json/json_tree.cu | 49 ++++++++---------------------------- 1 file changed, 10 insertions(+), 39 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index ff07380be37..43f7cf6e81b 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -208,10 +208,7 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_PUSH_RANGE("num_nodes"); auto num_tokens = tokens.size(); - // auto is_node_it = thrust::make_transform_iterator( - // tokens.begin(), - // [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); - auto num_nodes = thrust::count_if( + auto num_nodes = thrust::count_if( rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); CUDF_POP_RANGE(); @@ -240,17 +237,16 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_POP_RANGE(); CUDF_PUSH_RANGE("parent_token_ids"); - // Node parent ids: previous push token_id transform, stable sort, segmented scan with Max, - // reorder, copy_if. This one is sort of logical stack. But more generalized. + // Node parent ids: + // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. + // This one is sort of logical stack. But more generalized. // TODO: make it own function. - // TODO re-write the algorithm to work only on nodes, not tokens. - // // ### only push nodes matter for scan! (verify throughly if true. For i-1 == FE, then i-3 - // matters. or i-2 because FB and SMB treated same. make sure nodeid matches right for FB/SMB. - // // now copy only push operations to seperate array with token_levels, & node_id. - // // sort by level, then scan it. 
then scatter to node_id positions. - // // then another scan for non-push nodes? or another scan before scatter? L/S SMB FE. - // // total memory: num_nodes*(4b+b+4b) <= 9b*num_nodes. + // previous push node_id + // if previous node is a push, then i-1 + // if previous node is FE, then i-2 + // if previous node is SMB and its previous node is a push, then i-2 + // else -1 rmm::device_uvector parent_token_ids2(num_nodes, stream); auto prev_parent_node_it = thrust::make_transform_iterator( thrust::make_counting_iterator(0), @@ -342,13 +338,7 @@ tree_meta_t get_tree_representation(device_span tokens, { CUDF_PUSH_RANGE("node_ids"); - // use copy_if counting_it and do lower_bound. which is faster? - // rmm::device_uvector node_ids_gpu(num_tokens, stream); - // thrust::exclusive_scan( - // rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); - - rmm::device_uvector node_ids_gpu3(num_nodes, - stream); // TODO reuse initial_order memory. + rmm::device_uvector node_ids_gpu3(num_nodes, stream); thrust::copy_if(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_tokens, @@ -357,17 +347,8 @@ tree_meta_t get_tree_representation(device_span tokens, is_node); CUDF_POP_RANGE(); - // rmm::device_uvector node_ids_gpu2(num_tokens, stream); - // thrust::sequence(rmm::exec_policy(stream), node_ids_gpu2.begin(), node_ids_gpu2.end()); - // print_vec(node_ids_gpu2, "token_id_gpu", to_int); - // print_vec(tokens, "tokens", to_token_str); - // print_vec(node_ids_gpu, "node_ids_gpu", to_int); - // print_vec(parent_token_ids2, "parent_token_ids2", to_int); - // print_vec(node_categories, "node_categories", to_cat); - // print_vec(node_ids_gpu3, "node_ids_gpu3", to_int); CUDF_PUSH_RANGE("parent_node_ids"); - // rmm::device_uvector parent_node_ids2(num_nodes, stream); thrust::transform(rmm::exec_policy(stream), parent_token_ids2.begin(), parent_token_ids2.end(), @@ -381,16 +362,6 @@ tree_meta_t get_tree_representation(device_span tokens, node_ids_gpu; }); // print_vec(parent_node_ids2, "parent_node_ids2", to_int); - - // thrust::transform( - // rmm::exec_policy(stream), - // parent_token_ids2.begin(), - // parent_token_ids2.end(), - // parent_token_ids2.begin(), - // [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { - // return pid < 0 ? parent_node_sentinel : node_ids_gpu[pid]; - // }); - // print_vec(parent_token_ids2, "parent_token_ids2", to_int); CUDF_POP_RANGE(); } From 7efc890555061cad398154f54b2e563eafcd9e43 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 16:24:45 +0530 Subject: [PATCH 08/19] reorganize parent_node_ids algorithm (generic logical stack) --- cpp/src/io/json/json_tree.cu | 235 ++++++++++++++++++----------------- 1 file changed, 120 insertions(+), 115 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 43f7cf6e81b..428a295706a 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -166,6 +167,74 @@ struct node_ranges { } }; +/** + * @brief Returns stable sorted key and its sorted order + * + * @tparam IndexType sorted order type + * @tparam KeyType key type + * @param key key to sort + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @return A pair of sorted key and its sorted order + */ +template +std::pair, rmm::device_uvector> stable_sorted_key_order( + cudf::device_span key, rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + // Uses stable radix sort for builtin types. + + // Determine temporary device storage requirements + rmm::device_uvector key1(key.size(), stream); + rmm::device_uvector key2(key.size(), stream); + rmm::device_uvector order1(key.size(), stream); + rmm::device_uvector order2(key.size(), stream); + cub::DoubleBuffer order(order1.data(), order2.data()); + cub::DoubleBuffer key_buffer(key1.data(), key2.data()); + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, key_buffer, order, key.size()); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + CUDF_PUSH_RANGE("copy"); + thrust::copy(rmm::exec_policy(stream), key.begin(), key.end(), key1.begin()); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("seq"); + thrust::sequence(rmm::exec_policy(stream), order1.begin(), order1.end()); + CUDF_POP_RANGE(); + CUDF_PUSH_RANGE("cub-sort"); + cub::DeviceRadixSort::SortPairs( + d_temp_storage.data(), temp_storage_bytes, key_buffer, order, key.size()); + CUDF_POP_RANGE(); + return std::pair{key_buffer.Current() == key1.data() ? std::move(key1) : std::move(key2), + order.Current() == order1.data() ? std::move(order1) : std::move(order2)}; +} + +/** + * @brief Propagate parent node to siblings from first sibling. + * + * @param node_levels Node levels of each node + * @param parent_node_ids parent node ids initialized for first child of each push node, + * and other siblings are initialized to -1. + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +void propagate_parent_to_siblings(cudf::device_span node_levels, + cudf::device_span parent_node_ids, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); + // instead of gather, using permutation_iterator, which is ~17% faster + + CUDF_PUSH_RANGE("inscan"); + thrust::inclusive_scan_by_key( + rmm::exec_policy(stream), + sorted_node_levels.begin(), + sorted_node_levels.end(), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::equal_to{}, + thrust::maximum{}); + CUDF_POP_RANGE(); +} + // Generates a tree representation of the given tokens, token_indices. tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, @@ -218,7 +287,7 @@ tree_meta_t get_tree_representation(device_span tokens, // Node levels: transform_exclusive_scan, copy_if. rmm::device_uvector token_levels(num_tokens, stream); auto push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> TreeDepthT { return does_push(token) - does_pop(token); }); thrust::exclusive_scan( @@ -242,128 +311,64 @@ tree_meta_t get_tree_representation(device_span tokens, // This one is sort of logical stack. But more generalized. // TODO: make it own function. 
- // previous push node_id - // if previous node is a push, then i-1 - // if previous node is FE, then i-2 - // if previous node is SMB and its previous node is a push, then i-2 - // else -1 - rmm::device_uvector parent_token_ids2(num_nodes, stream); - auto prev_parent_node_it = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { - return (i > 0) ? ((tokens_gpu[i - 1] == token_t::StructBegin || - tokens_gpu[i - 1] == token_t::ListBegin) - ? i - 1 - : ((tokens_gpu[i - 1] == token_t::FieldNameEnd || - (tokens_gpu[i - 1] == token_t::StructMemberBegin && - (tokens_gpu[i - 2] == token_t::StructBegin || - tokens_gpu[i - 2] == token_t::ListBegin))) - ? i - 2 - : -1)) - : -1; - // -1, not sentinel used here because of max operation below - }); - CUDF_PUSH_RANGE("copy_if"); - thrust::copy_if(rmm::exec_policy(stream), - prev_parent_node_it, - prev_parent_node_it + num_tokens, - tokens.begin(), - parent_token_ids2.begin(), - is_node); - CUDF_POP_RANGE(); - - { - auto [node_levels1, initial_order1] = [&]() { - // Uses stable radix sort for builtin types. - CUDF_PUSH_RANGE("cub-sort"); - CUDF_PUSH_RANGE("seq"); - rmm::device_uvector initial_order2(num_nodes, stream); - thrust::sequence(rmm::exec_policy(stream), initial_order2.begin(), initial_order2.end()); - CUDF_POP_RANGE(); - - // Determine temporary device storage requirements - size_t temp_storage_bytes = 0; - CUDF_PUSH_RANGE("cub-out"); - rmm::device_uvector node_levels2(node_levels, stream); - rmm::device_uvector node_levels1(num_nodes, stream); - rmm::device_uvector initial_order1(num_nodes, stream); - cub::DoubleBuffer order(initial_order2.data(), initial_order1.data()); - cub::DoubleBuffer levels(node_levels2.data(), node_levels1.data()); - cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, levels, order, num_nodes); - CUDF_PUSH_RANGE("cub-temp"); - // std::cout<<"temp_storage_bytes: "< parent_token_ids1(num_nodes, stream); - thrust::gather(rmm::exec_policy(stream), - initial_order1.begin(), - initial_order1.end(), - parent_token_ids2.begin(), - parent_token_ids1.begin()); - CUDF_POP_RANGE(); - - CUDF_PUSH_RANGE("inscan"); - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - node_levels1.data(), - node_levels1.data() + num_nodes, - parent_token_ids1.data(), - parent_token_ids1.data(), - thrust::equal_to{}, - thrust::maximum{}); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("scatter"); - // scatter to restore the original order. 
- thrust::scatter(rmm::exec_policy(stream), - parent_token_ids1.begin(), - parent_token_ids1.end(), - initial_order1.begin(), - parent_token_ids2.begin()); - CUDF_POP_RANGE(); - CUDF_POP_RANGE(); - } - - CUDF_POP_RANGE(); // parent_token_ids - + rmm::device_uvector parent_node_ids(num_nodes, stream); { - CUDF_PUSH_RANGE("node_ids"); - rmm::device_uvector node_ids_gpu3(num_nodes, stream); + rmm::device_uvector node_token_ids(num_nodes, stream); + CUDF_PUSH_RANGE("copy_if"); thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + num_tokens, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, tokens.begin(), - node_ids_gpu3.begin(), + node_token_ids.begin(), is_node); - CUDF_POP_RANGE(); + // previous push node_id + // if previous node is a push, then i-1 + // if previous node is FE, then i-2 + // if previous node is SMB and its previous node is a push, then i-2 + // else -1 + auto first_childs_parent_token_id = [tokens_gpu = + tokens.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) return -1; + if (tokens_gpu[i - 1] == token_t::StructBegin || tokens_gpu[i - 1] == token_t::ListBegin) + return i - 1; + else if (tokens_gpu[i - 1] == token_t::FieldNameEnd) + return i - 2; + else if (tokens_gpu[i - 1] == token_t::StructMemberBegin && + (tokens_gpu[i - 2] == token_t::StructBegin || + tokens_gpu[i - 2] == token_t::ListBegin)) + return i - 2; + else + return -1; + // -1 is not sentinel, is required because of max operation below + }; + CUDF_PUSH_RANGE("parent_node_ids"); - thrust::transform(rmm::exec_policy(stream), - parent_token_ids2.begin(), - parent_token_ids2.end(), - parent_token_ids2.begin(), - [node_ids_gpu = node_ids_gpu3.begin(), - num_nodes] __device__(size_type const pid) -> NodeIndexT { - return pid < 0 - ? parent_node_sentinel - : thrust::lower_bound( - thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - - node_ids_gpu; - }); - // print_vec(parent_node_ids2, "parent_node_ids2", to_int); + thrust::transform( + rmm::exec_policy(stream), + node_token_ids.begin(), + node_token_ids.end(), + parent_node_ids.begin(), + [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__( + NodeIndexT const tid) -> NodeIndexT { + auto pid = first_childs_parent_token_id(tid); + return pid < 0 + ? parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + // TODO node id will be 1 or 2 nodes back. It could be faster than binary search. + }); + // print_vec(parent_node_ids, "parent_node_ids", to_int); CUDF_POP_RANGE(); } + // Propagate parent node to siblings from first sibling - inplace. + propagate_parent_to_siblings( + cudf::device_span{node_levels.data(), node_levels.size()}, + parent_node_ids, + stream); + + CUDF_POP_RANGE(); // parent_token_ids CUDF_PUSH_RANGE("node_categories"); // Node categories: copy_if with transform. 
@@ -403,7 +408,7 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_POP_RANGE(); return {std::move(node_categories), - std::move(parent_token_ids2), + std::move(parent_node_ids), std::move(node_levels), std::move(node_range_begin), std::move(node_range_end)}; From 6d3a1663fbbc1f33de6adf2e1f2801c3d8568f9d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 16:55:59 +0530 Subject: [PATCH 09/19] include CUDF_PUSH_RANGE, CUDF_POP_RANGE nvtx macros --- cpp/src/io/json/json_tree.cu | 1 - cpp/src/io/json/nested_json.hpp | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 428a295706a..e9e6ae64bd0 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -357,7 +357,6 @@ tree_meta_t get_tree_representation(device_span tokens, ? parent_node_sentinel : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - node_ids_gpu; - // TODO node id will be 1 or 2 nodes back. It could be faster than binary search. }); // print_vec(parent_node_ids, "parent_node_ids", to_int); CUDF_POP_RANGE(); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 10d209b2ea6..90b679265fb 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -367,3 +367,17 @@ table_with_metadata host_parse_nested_json( } // namespace detail } // namespace cudf::io::json + +#define _CONCAT_(x, y) x##y +#define CONCAT(x, y) _CONCAT_(x, y) + +#define NVTX3_PUSH_RANGE_IN(D, tag) \ + ::nvtx3::registered_message const CONCAT(nvtx3_range_name__, __LINE__){std::string(tag)}; \ + ::nvtx3::event_attributes const CONCAT(nvtx3_range_attr__, \ + __LINE__){CONCAT(nvtx3_range_name__, __LINE__)}; \ + nvtxDomainRangePushEx(::nvtx3::domain::get(), CONCAT(nvtx3_range_attr__, __LINE__).get()); + +#define NVTX3_POP_RANGE(D) nvtxDomainRangePop(::nvtx3::domain::get()); + +#define CUDF_PUSH_RANGE(tag) NVTX3_PUSH_RANGE_IN(cudf::libcudf_domain, tag) +#define CUDF_POP_RANGE() NVTX3_POP_RANGE(cudf::libcudf_domain) From bbcbffa7ce5bd35395782db8c98e80cd8b46f5f0 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 17:04:21 +0530 Subject: [PATCH 10/19] replace TreeDepthT with size_type due to cuda Invalid Device function error --- cpp/src/io/json/json_tree.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index e9e6ae64bd0..0c06320a6dc 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -287,7 +287,7 @@ tree_meta_t get_tree_representation(device_span tokens, // Node levels: transform_exclusive_scan, copy_if. 
rmm::device_uvector token_levels(num_tokens, stream); auto push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> TreeDepthT { + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { return does_push(token) - does_pop(token); }); thrust::exclusive_scan( From f9f0926c801d5708630d26558aa97428c5d2744c Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 17:19:01 +0530 Subject: [PATCH 11/19] update docs --- cpp/src/io/json/json_tree.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 0c06320a6dc..5a9b3ca18b3 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -170,6 +170,8 @@ struct node_ranges { /** * @brief Returns stable sorted key and its sorted order * + * Uses cub stable radix sort + * * @tparam IndexType sorted order type * @tparam KeyType key type * @param key key to sort @@ -181,7 +183,6 @@ std::pair, rmm::device_uvector> stable_s cudf::device_span key, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - // Uses stable radix sort for builtin types. // Determine temporary device storage requirements rmm::device_uvector key1(key.size(), stream); @@ -308,7 +309,7 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_PUSH_RANGE("parent_token_ids"); // Node parent ids: // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. - // This one is sort of logical stack. But more generalized. + // This algorithms si is more generalized logical stack. // TODO: make it own function. rmm::device_uvector parent_node_ids(num_nodes, stream); @@ -341,7 +342,6 @@ tree_meta_t get_tree_representation(device_span tokens, return i - 2; else return -1; - // -1 is not sentinel, is required because of max operation below }; CUDF_PUSH_RANGE("parent_node_ids"); @@ -357,6 +357,7 @@ tree_meta_t get_tree_representation(device_span tokens, ? 
parent_node_sentinel : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - node_ids_gpu; + // parent_node_sentinel is -1, useful for segmented max operation below }); // print_vec(parent_node_ids, "parent_node_ids", to_int); CUDF_POP_RANGE(); From f851232bc3e7eecd81e71020da8f5c3e27b3c441 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 17:27:54 +0530 Subject: [PATCH 12/19] remove nvtx range macros and debug prints --- cpp/src/io/json/json_tree.cu | 75 +++--------------------------------- 1 file changed, 6 insertions(+), 69 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 5a9b3ca18b3..6a94ea05e81 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -56,45 +56,6 @@ namespace cudf::io::json { namespace detail { -// DEBUG print -[[maybe_unused]] auto to_token_str = [](PdaTokenT token) -> std::string { - switch (token) { - case token_t::StructBegin: return " {"; - case token_t::StructEnd: return " }"; - case token_t::ListBegin: return " ["; - case token_t::ListEnd: return " ]"; - case token_t::FieldNameBegin: return "FB"; - case token_t::FieldNameEnd: return "FE"; - case token_t::StringBegin: return "SB"; - case token_t::StringEnd: return "SE"; - case token_t::ErrorBegin: return "er"; - case token_t::ValueBegin: return "VB"; - case token_t::ValueEnd: return "VE"; - case token_t::StructMemberBegin: return " <"; - case token_t::StructMemberEnd: return " >"; - default: return "."; - } -}; -// DEBUG prints -auto to_cat = [](auto v) -> std::string { - switch (v) { - case NC_STRUCT: return " S"; - case NC_LIST: return " L"; - case NC_STR: return " \""; - case NC_VAL: return " V"; - case NC_FN: return " F"; - case NC_ERR: return "ER"; - default: return "UN"; - }; -}; -auto to_int = [](auto v) { return std::to_string(static_cast(v)); }; -auto print_vec = [](auto const& gpu, auto const name, auto converter) { - auto cpu = cudf::detail::make_std_vector_sync(gpu, cudf::default_stream_value); - for (auto const& v : cpu) - printf("%3s,", converter(v).c_str()); - std::cout << name << std::endl; -}; - // The node that a token represents struct token_to_node { __device__ auto operator()(PdaTokenT const token) -> NodeT @@ -194,16 +155,13 @@ std::pair, rmm::device_uvector> stable_s size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, key_buffer, order, key.size()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - CUDF_PUSH_RANGE("copy"); + thrust::copy(rmm::exec_policy(stream), key.begin(), key.end(), key1.begin()); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("seq"); thrust::sequence(rmm::exec_policy(stream), order1.begin(), order1.end()); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("cub-sort"); + cub::DeviceRadixSort::SortPairs( d_temp_storage.data(), temp_storage_bytes, key_buffer, order, key.size()); - CUDF_POP_RANGE(); + return std::pair{key_buffer.Current() == key1.data() ? std::move(key1) : std::move(key2), order.Current() == order1.data() ? 
std::move(order1) : std::move(order2)}; } @@ -224,7 +182,6 @@ void propagate_parent_to_siblings(cudf::device_span node_level auto [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); // instead of gather, using permutation_iterator, which is ~17% faster - CUDF_PUSH_RANGE("inscan"); thrust::inclusive_scan_by_key( rmm::exec_policy(stream), sorted_node_levels.begin(), @@ -233,7 +190,6 @@ void propagate_parent_to_siblings(cudf::device_span node_level thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), thrust::equal_to{}, thrust::maximum{}); - CUDF_POP_RANGE(); } // Generates a tree representation of the given tokens, token_indices. @@ -276,16 +232,13 @@ tree_meta_t get_tree_representation(device_span tokens, }; }; - CUDF_PUSH_RANGE("num_nodes"); auto num_tokens = tokens.size(); auto num_nodes = thrust::count_if( rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("token_levels"); + // Node levels: transform_exclusive_scan, copy_if. rmm::device_uvector node_levels(num_nodes, stream, mr); { - // Node levels: transform_exclusive_scan, copy_if. rmm::device_uvector token_levels(num_tokens, stream); auto push_pop_it = thrust::make_transform_iterator( tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { @@ -293,9 +246,7 @@ tree_meta_t get_tree_representation(device_span tokens, }); thrust::exclusive_scan( rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("node_levels"); auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), token_levels.begin(), token_levels.begin() + num_tokens, @@ -304,25 +255,20 @@ tree_meta_t get_tree_representation(device_span tokens, is_node); CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); } - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("parent_token_ids"); // Node parent ids: // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. - // This algorithms si is more generalized logical stack. + // This algorithms is a more generalized logical stack. // TODO: make it own function. - - rmm::device_uvector parent_node_ids(num_nodes, stream); + rmm::device_uvector parent_node_ids(num_nodes, stream, mr); { rmm::device_uvector node_token_ids(num_nodes, stream); - CUDF_PUSH_RANGE("copy_if"); thrust::copy_if(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_tokens, tokens.begin(), node_token_ids.begin(), is_node); - CUDF_POP_RANGE(); // previous push node_id // if previous node is a push, then i-1 @@ -344,7 +290,6 @@ tree_meta_t get_tree_representation(device_span tokens, return -1; }; - CUDF_PUSH_RANGE("parent_node_ids"); thrust::transform( rmm::exec_policy(stream), node_token_ids.begin(), @@ -359,8 +304,6 @@ tree_meta_t get_tree_representation(device_span tokens, node_ids_gpu; // parent_node_sentinel is -1, useful for segmented max operation below }); - // print_vec(parent_node_ids, "parent_node_ids", to_int); - CUDF_POP_RANGE(); } // Propagate parent node to siblings from first sibling - inplace. propagate_parent_to_siblings( @@ -368,9 +311,6 @@ tree_meta_t get_tree_representation(device_span tokens, parent_node_ids, stream); - CUDF_POP_RANGE(); // parent_token_ids - - CUDF_PUSH_RANGE("node_categories"); // Node categories: copy_if with transform. 
rmm::device_uvector node_categories(num_nodes, stream, mr); auto node_categories_it = @@ -382,9 +322,7 @@ tree_meta_t get_tree_representation(device_span tokens, is_node); CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, "node category count mismatch"); - CUDF_POP_RANGE(); - CUDF_PUSH_RANGE("node_ranges"); // Node ranges: copy_if with transform. rmm::device_uvector node_range_begin(num_nodes, stream, mr); rmm::device_uvector node_range_end(num_nodes, stream, mr); @@ -405,7 +343,6 @@ tree_meta_t get_tree_representation(device_span tokens, return is_node(tokens_gpu[i]); }); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); - CUDF_POP_RANGE(); return {std::move(node_categories), std::move(parent_node_ids), From 55369c9c5af7579617f9853aeec642c082789d63 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 17:46:56 +0530 Subject: [PATCH 13/19] remove nvtx macros --- cpp/src/io/json/nested_json.hpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 90b679265fb..10d209b2ea6 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -367,17 +367,3 @@ table_with_metadata host_parse_nested_json( } // namespace detail } // namespace cudf::io::json - -#define _CONCAT_(x, y) x##y -#define CONCAT(x, y) _CONCAT_(x, y) - -#define NVTX3_PUSH_RANGE_IN(D, tag) \ - ::nvtx3::registered_message const CONCAT(nvtx3_range_name__, __LINE__){std::string(tag)}; \ - ::nvtx3::event_attributes const CONCAT(nvtx3_range_attr__, \ - __LINE__){CONCAT(nvtx3_range_name__, __LINE__)}; \ - nvtxDomainRangePushEx(::nvtx3::domain::get(), CONCAT(nvtx3_range_attr__, __LINE__).get()); - -#define NVTX3_POP_RANGE(D) nvtxDomainRangePop(::nvtx3::domain::get()); - -#define CUDF_PUSH_RANGE(tag) NVTX3_PUSH_RANGE_IN(cudf::libcudf_domain, tag) -#define CUDF_POP_RANGE() NVTX3_POP_RANGE(cudf::libcudf_domain) From 5eefd64b2370b6f4b04f1af318f85f6ec247a3d4 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 17:47:17 +0530 Subject: [PATCH 14/19] NVTX RANGES macros commit --- cpp/src/io/json/nested_json.hpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 10d209b2ea6..34a071d3359 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -367,3 +367,27 @@ table_with_metadata host_parse_nested_json( } // namespace detail } // namespace cudf::io::json + +#define _CONCAT_(x, y) x##y +#define CONCAT(x, y) _CONCAT_(x, y) + +#define NVTX3_PUSH_RANGE_IN(D, tag) \ + ::nvtx3::registered_message const CONCAT(nvtx3_range_name__, __LINE__){std::string(tag)}; \ + ::nvtx3::event_attributes const CONCAT(nvtx3_range_attr__, \ + __LINE__){CONCAT(nvtx3_range_name__, __LINE__)}; \ + nvtxDomainRangePushEx(::nvtx3::domain::get(), CONCAT(nvtx3_range_attr__, __LINE__).get()); + +#define NVTX3_POP_RANGE(D) nvtxDomainRangePop(::nvtx3::domain::get()); + +#define CUDF_PUSH_RANGE(tag) NVTX3_PUSH_RANGE_IN(cudf::libcudf_domain, tag) +#define CUDF_POP_RANGE() NVTX3_POP_RANGE(cudf::libcudf_domain) + +#define NVTX3_SCOPED_RANGE_IN(D, tag) \ + ::nvtx3::registered_message const CONCAT(nvtx3_scope_name__, \ + __LINE__){std::string(__func__) + "::" + tag}; \ + ::nvtx3::event_attributes const CONCAT(nvtx3_scope_attr__, \ + __LINE__){CONCAT(nvtx3_scope_name__, __LINE__)}; \ + ::nvtx3::domain_thread_range const CONCAT(nvtx3_range__, \ + 
__LINE__){CONCAT(nvtx3_scope_attr__, __LINE__)}; + +#define CUDF_SCOPED_RANGE(tag) NVTX3_SCOPED_RANGE_IN(cudf::libcudf_domain, tag) From 3bb54f4dbafad77d48bef7f11e509c489c453227 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 7 Oct 2022 17:48:09 +0530 Subject: [PATCH 15/19] Revert "NVTX RANGES macros commit" This reverts commit 5eefd64b2370b6f4b04f1af318f85f6ec247a3d4. --- cpp/src/io/json/nested_json.hpp | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 34a071d3359..10d209b2ea6 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -367,27 +367,3 @@ table_with_metadata host_parse_nested_json( } // namespace detail } // namespace cudf::io::json - -#define _CONCAT_(x, y) x##y -#define CONCAT(x, y) _CONCAT_(x, y) - -#define NVTX3_PUSH_RANGE_IN(D, tag) \ - ::nvtx3::registered_message const CONCAT(nvtx3_range_name__, __LINE__){std::string(tag)}; \ - ::nvtx3::event_attributes const CONCAT(nvtx3_range_attr__, \ - __LINE__){CONCAT(nvtx3_range_name__, __LINE__)}; \ - nvtxDomainRangePushEx(::nvtx3::domain::get(), CONCAT(nvtx3_range_attr__, __LINE__).get()); - -#define NVTX3_POP_RANGE(D) nvtxDomainRangePop(::nvtx3::domain::get()); - -#define CUDF_PUSH_RANGE(tag) NVTX3_PUSH_RANGE_IN(cudf::libcudf_domain, tag) -#define CUDF_POP_RANGE() NVTX3_POP_RANGE(cudf::libcudf_domain) - -#define NVTX3_SCOPED_RANGE_IN(D, tag) \ - ::nvtx3::registered_message const CONCAT(nvtx3_scope_name__, \ - __LINE__){std::string(__func__) + "::" + tag}; \ - ::nvtx3::event_attributes const CONCAT(nvtx3_scope_attr__, \ - __LINE__){CONCAT(nvtx3_scope_name__, __LINE__)}; \ - ::nvtx3::domain_thread_range const CONCAT(nvtx3_range__, \ - __LINE__){CONCAT(nvtx3_scope_attr__, __LINE__)}; - -#define CUDF_SCOPED_RANGE(tag) NVTX3_SCOPED_RANGE_IN(cudf::libcudf_domain, tag) From 5a0a9a77f231735d683c0284d0c2af13b1b65b27 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 11 Oct 2022 09:58:14 +0530 Subject: [PATCH 16/19] address review comments (upsj) --- cpp/src/io/json/json_tree.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 6a94ea05e81..f00abae9111 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -146,12 +146,12 @@ std::pair, rmm::device_uvector> stable_s CUDF_FUNC_RANGE(); // Determine temporary device storage requirements - rmm::device_uvector key1(key.size(), stream); - rmm::device_uvector key2(key.size(), stream); + rmm::device_uvector key1(key.size(), stream); + rmm::device_uvector key2(key.size(), stream); rmm::device_uvector order1(key.size(), stream); rmm::device_uvector order2(key.size(), stream); cub::DoubleBuffer order(order1.data(), order2.data()); - cub::DoubleBuffer key_buffer(key1.data(), key2.data()); + cub::DoubleBuffer key_buffer(key1.data(), key2.data()); size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, key_buffer, order, key.size()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); @@ -272,8 +272,9 @@ tree_meta_t get_tree_representation(device_span tokens, // previous push node_id // if previous node is a push, then i-1 - // if previous node is FE, then i-2 + // if previous node is FE, then i-2 (returns FB's index) // if previous node is SMB and its previous node is a push, then i-2 + // eg. `{ SMB FB FE VB VE SME` -> `{` index as FB's parent. 
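    // (Illustrative walk-through of the example above; token indices are assumed as
    //  0:`{` 1:SMB 2:FB 3:FE 4:VB 5:VE 6:SME.
    //  FB at i=2: previous token is SMB and the token before that is a push, so the
    //             parent token id is i-2 = 0, i.e. the `{`.
    //  VB at i=4: previous token is FE, so the parent token id is i-2 = 2, i.e. the
    //             field name FB.)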
// else -1 auto first_childs_parent_token_id = [tokens_gpu = tokens.begin()] __device__(auto i) -> NodeIndexT { From a356ea01caf5cee10b37acafc669b4d3cfa6e6d7 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 14 Oct 2022 08:16:00 +0530 Subject: [PATCH 17/19] Apply suggestions from code review Co-authored-by: Bradley Dice --- cpp/src/io/json/json_tree.cu | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index f00abae9111..1a9d3a01db8 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -129,29 +129,29 @@ struct node_ranges { }; /** - * @brief Returns stable sorted key and its sorted order + * @brief Returns stable sorted keys and its sorted order * * Uses cub stable radix sort * * @tparam IndexType sorted order type * @tparam KeyType key type - * @param key key to sort + * @param keys keys to sort * @param stream CUDA stream used for device memory operations and kernel launches. - * @return A pair of sorted key and its sorted order + * @return Sorted keys and indices producing that sorted order */ template std::pair, rmm::device_uvector> stable_sorted_key_order( - cudf::device_span key, rmm::cuda_stream_view stream) + cudf::device_span keys, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); // Determine temporary device storage requirements - rmm::device_uvector key1(key.size(), stream); - rmm::device_uvector key2(key.size(), stream); - rmm::device_uvector order1(key.size(), stream); - rmm::device_uvector order2(key.size(), stream); - cub::DoubleBuffer order(order1.data(), order2.data()); - cub::DoubleBuffer key_buffer(key1.data(), key2.data()); + rmm::device_uvector keys_buffer1(key.size(), stream); + rmm::device_uvector keys_buffer2(key.size(), stream); + rmm::device_uvector order_buffer1(key.size(), stream); + rmm::device_uvector order_buffer2(key.size(), stream); + cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); + cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, key_buffer, order, key.size()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); @@ -253,7 +253,7 @@ tree_meta_t get_tree_representation(device_span tokens, tokens.begin(), node_levels.begin(), is_node); - CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); + CUDF_EXPECTS(thrust::distance(node_levels.begin(), node_levels_end) == num_nodes, "node level count mismatch"); } // Node parent ids: From 7116570b1de8fa9c4e46d4945ccc67b0d2726edc Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 14 Oct 2022 08:22:43 +0530 Subject: [PATCH 18/19] address review comments --- cpp/src/io/json/json_tree.cu | 44 ++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 1a9d3a01db8..3d3fad41d76 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -146,24 +146,27 @@ std::pair, rmm::device_uvector> stable_s CUDF_FUNC_RANGE(); // Determine temporary device storage requirements - rmm::device_uvector keys_buffer1(key.size(), stream); - rmm::device_uvector keys_buffer2(key.size(), stream); - rmm::device_uvector order_buffer1(key.size(), stream); - rmm::device_uvector order_buffer2(key.size(), stream); + rmm::device_uvector 
keys_buffer1(keys.size(), stream); + rmm::device_uvector keys_buffer2(keys.size(), stream); + rmm::device_uvector order_buffer1(keys.size(), stream); + rmm::device_uvector order_buffer2(keys.size(), stream); cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, key_buffer, order, key.size()); + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size()); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - thrust::copy(rmm::exec_policy(stream), key.begin(), key.end(), key1.begin()); - thrust::sequence(rmm::exec_policy(stream), order1.begin(), order1.end()); + thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin()); + thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); cub::DeviceRadixSort::SortPairs( - d_temp_storage.data(), temp_storage_bytes, key_buffer, order, key.size()); + d_temp_storage.data(), temp_storage_bytes, keys_buffer, order_buffer, keys.size()); - return std::pair{key_buffer.Current() == key1.data() ? std::move(key1) : std::move(key2), - order.Current() == order1.data() ? std::move(order1) : std::move(order2)}; + return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) + : std::move(keys_buffer2), + order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) + : std::move(order_buffer2)}; } /** @@ -253,14 +256,14 @@ tree_meta_t get_tree_representation(device_span tokens, tokens.begin(), node_levels.begin(), is_node); - CUDF_EXPECTS(thrust::distance(node_levels.begin(), node_levels_end) == num_nodes, "node level count mismatch"); + CUDF_EXPECTS(thrust::distance(node_levels.begin(), node_levels_end) == num_nodes, + "node level count mismatch"); } // Node parent ids: // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. - // This algorithms is a more generalized logical stack. - // TODO: make it own function. rmm::device_uvector parent_node_ids(num_nodes, stream, mr); + // This block of code is generalized logical stack algorithm. TODO: make this a seperate function. 
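  // Outline of the steps below:
  //   1. copy_if collects the token index of every node (node_token_ids).
  //   2. transform records, for each node, its parent's token index only if the node is the
  //      first child (first_childs_parent_token_id), else -1, and maps that token index to a
  //      node id with lower_bound over node_token_ids.
  //   3. propagate_parent_to_siblings stable-sorts nodes by level and runs a max scan per
  //      level so later siblings inherit the parent id written at the first sibling.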
{ rmm::device_uvector node_token_ids(num_nodes, stream); thrust::copy_if(rmm::exec_policy(stream), @@ -278,17 +281,18 @@ tree_meta_t get_tree_representation(device_span tokens, // else -1 auto first_childs_parent_token_id = [tokens_gpu = tokens.begin()] __device__(auto i) -> NodeIndexT { - if (i <= 0) return -1; - if (tokens_gpu[i - 1] == token_t::StructBegin || tokens_gpu[i - 1] == token_t::ListBegin) + if (i <= 0) { return -1; } + if (tokens_gpu[i - 1] == token_t::StructBegin or tokens_gpu[i - 1] == token_t::ListBegin) { return i - 1; - else if (tokens_gpu[i - 1] == token_t::FieldNameEnd) + } else if (tokens_gpu[i - 1] == token_t::FieldNameEnd) { return i - 2; - else if (tokens_gpu[i - 1] == token_t::StructMemberBegin && - (tokens_gpu[i - 2] == token_t::StructBegin || - tokens_gpu[i - 2] == token_t::ListBegin)) + } else if (tokens_gpu[i - 1] == token_t::StructMemberBegin and + (tokens_gpu[i - 2] == token_t::StructBegin || + tokens_gpu[i - 2] == token_t::ListBegin)) { return i - 2; - else + } else { return -1; + } }; thrust::transform( From 8e0c85ff5c9a9ada87bb21d9327be88081fcf2f8 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 14 Oct 2022 08:53:39 +0530 Subject: [PATCH 19/19] add copy, memory savings comments for stable_sorted_key_order --- cpp/src/io/json/json_tree.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 3d3fad41d76..cf041b02a20 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -131,7 +131,9 @@ struct node_ranges { /** * @brief Returns stable sorted keys and its sorted order * - * Uses cub stable radix sort + * Uses cub stable radix sort. The order is internally generated, hence it saves a copy and memory. + * Since the key and order is returned, using double buffer helps to avoid extra copy to user + * provided output iterator. * * @tparam IndexType sorted order type * @tparam KeyType key type
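For context, a minimal host-only sketch (not part of the patch series) of what the level-keyed max scan in propagate_parent_to_siblings does; the levels and parent ids below are made up for illustration:

#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>

#include <cstdio>

int main()
{
  // Nodes already stable-sorted by level; only the first child of each parent carries the
  // real parent node id, the remaining siblings hold -1 (the sentinel).
  thrust::host_vector<int> sorted_levels{1, 1, 1, 2, 2, 2};
  thrust::host_vector<int> parent_ids{0, -1, -1, 2, -1, -1};

  // Segmented scan keyed by level with maximum: within a level, later first children have
  // larger parent ids, so max(-1, parent_id) propagates each parent forward to its siblings.
  thrust::inclusive_scan_by_key(sorted_levels.begin(),
                                sorted_levels.end(),
                                parent_ids.begin(),
                                parent_ids.begin(),
                                thrust::equal_to<int>{},
                                thrust::maximum<int>{});

  for (int p : parent_ids) std::printf("%d ", p);  // prints: 0 0 0 2 2 2
  std::printf("\n");
  return 0;
}

In the patch itself the same scan runs on the device, reading and writing parent_node_ids through a permutation iterator over sorted_order, so the parent ids are updated in place without a separate gather step.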