diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch index ae1962e4738..0dd9854d4aa 100644 --- a/cpp/cmake/thrust.patch +++ b/cpp/cmake/thrust.patch @@ -114,3 +114,29 @@ index d0e3f94..76774b0 100644 /** * Dispatch between 32-bit and 64-bit index based versions of the same algorithm * implementation. This version allows using different token sequences for callables +diff --git a/thrust/iterator/transform_input_output_iterator.h b/thrust/iterator/transform_input_output_iterator.h +index f512a36..a5f725d 100644 +--- a/thrust/iterator/transform_input_output_iterator.h ++++ b/thrust/iterator/transform_input_output_iterator.h +@@ -102,6 +102,8 @@ template + /*! \endcond + */ + ++ transform_input_output_iterator() = default; ++ + /*! This constructor takes as argument a \c Iterator an \c InputFunction and an + * \c OutputFunction and copies them to a new \p transform_input_output_iterator + * +diff --git a/thrust/iterator/transform_output_iterator.h b/thrust/iterator/transform_output_iterator.h +index 66fb46a..4a68cb5 100644 +--- a/thrust/iterator/transform_output_iterator.h ++++ b/thrust/iterator/transform_output_iterator.h +@@ -104,6 +104,8 @@ template + /*! \endcond + */ + ++ transform_output_iterator() = default; ++ + /*! This constructor takes as argument an \c OutputIterator and an \c + * UnaryFunction and copies them to a new \p transform_output_iterator + * diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index cf21da1b030..f43089210fd 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -207,9 +207,31 @@ std::unique_ptr rank( /** * @brief Returns sorted order after sorting each segment in the table. * - * If segment_offsets contains values larger than number of rows, behavior is undefined. + * If segment_offsets contains values larger than the number of rows, the behavior is undefined. * @throws cudf::logic_error if `segment_offsets` is not `size_type` column. 
* + * @code{.pseudo} + * Example: + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * offsets = {0, 3, 7, 10} + * result = cudf::segmented_sorted_order(keys, offsets); + * result is { 2,1,0, 6,5,4,3, 9,8,7 } + * @endcode + * + * If segment_offsets is empty or contains a single index, no values are sorted + * and the result is a sequence of integers from 0 to keys.size()-1. + * + * The segment_offsets are not required to include all indices. Any indices + * outside the specified segments will not be sorted. + * + * @code{.pseudo} + * Example: (offsets do not cover all indices) + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * offsets = {3, 7} + * result = cudf::segmented_sorted_order(keys, offsets); + * result is { 0,1,2, 6,5,4,3, 7,8,9 } + * @endcode + * * @param keys The table that determines the ordering of elements in each segment * @param segment_offsets The column of `size_type` type containing start offset index for each * contiguous segment. @@ -246,10 +268,34 @@ std::unique_ptr stable_segmented_sorted_order( /** * @brief Performs a lexicographic segmented sort of a table * - * If segment_offsets contains values larger than number of rows, behavior is undefined. + * If segment_offsets contains values larger than the number of rows, the behavior is undefined. * @throws cudf::logic_error if `values.num_rows() != keys.num_rows()`. * @throws cudf::logic_error if `segment_offsets` is not `size_type` column. * + * @code{.pseudo} + * Example: + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * values = { {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'} } + * offsets = {0, 3, 7, 10} + * result = cudf::segmented_sort_by_key(keys, values, offsets); + * result is { 'c','b','a', 'g','f','e','d', 'j','i','h' } + * @endcode + * + * If segment_offsets is empty or contains a single index, no values are sorted + * and the result is a copy of the values. + * + * The segment_offsets are not required to include all indices. 
Any indices + * outside the specified segments will not be sorted. + * + * @code{.pseudo} + * Example: (offsets do not cover all indices) + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * values = { {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'} } + * offsets = {3, 7} + * result = cudf::segmented_sort_by_key(keys, values, offsets); + * result is { 'a','b','c', 'g','f','e','d', 'h','i','j' } + * @endcode + * * @param values The table to reorder * @param keys The table that determines the ordering of elements in each segment * @param segment_offsets The column of `size_type` type containing start offset index for each diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index d54bb5c8ea9..872e742a5af 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -403,7 +403,7 @@ void make_device_json_column(device_span input, std::string name = ""; auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - name = "element"; + name = list_child_name; } else if (column_categories[parent_col_id] == NC_FN) { auto field_name_col_id = parent_col_id; parent_col_id = column_parent_ids[parent_col_id]; @@ -689,19 +689,24 @@ std::pair, std::vector> device_json_co size_type num_rows = json_col.child_offsets.size() - 1; std::vector column_names{}; column_names.emplace_back("offsets"); - column_names.emplace_back(json_col.child_columns.begin()->first); + column_names.emplace_back( + json_col.child_columns.empty() ? 
list_child_name : json_col.child_columns.begin()->first); // Note: json_col modified here, reuse the memory auto offsets_column = std::make_unique( data_type{type_id::INT32}, num_rows + 1, json_col.child_offsets.release()); // Create children column auto [child_column, names] = - device_json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - get_child_schema(json_col.child_columns.begin()->first), - stream, - mr); + json_col.child_columns.empty() + ? std::pair, + std::vector>{std::make_unique(), {}} + : device_json_column_to_cudf_column( + json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows, diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index dbf026c351e..cf041b02a20 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -29,6 +29,8 @@ #include +#include + #include #include #include @@ -39,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -125,6 +128,75 @@ struct node_ranges { } }; +/** + * @brief Returns stable sorted keys and its sorted order + * + * Uses cub stable radix sort. The order is internally generated, hence it saves a copy and memory. + * Since the key and order is returned, using double buffer helps to avoid extra copy to user + * provided output iterator. + * + * @tparam IndexType sorted order type + * @tparam KeyType key type + * @param keys keys to sort + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @return Sorted keys and indices producing that sorted order + */ +template +std::pair, rmm::device_uvector> stable_sorted_key_order( + cudf::device_span keys, rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + + // Determine temporary device storage requirements + rmm::device_uvector keys_buffer1(keys.size(), stream); + rmm::device_uvector keys_buffer2(keys.size(), stream); + rmm::device_uvector order_buffer1(keys.size(), stream); + rmm::device_uvector order_buffer2(keys.size(), stream); + cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); + cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size()); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + + thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin()); + thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); + + cub::DeviceRadixSort::SortPairs( + d_temp_storage.data(), temp_storage_bytes, keys_buffer, order_buffer, keys.size()); + + return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) + : std::move(keys_buffer2), + order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) + : std::move(order_buffer2)}; +} + +/** + * @brief Propagate parent node to siblings from first sibling. + * + * @param node_levels Node levels of each node + * @param parent_node_ids parent node ids initialized for first child of each push node, + * and other siblings are initialized to -1. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +void propagate_parent_to_siblings(cudf::device_span node_levels, + cudf::device_span parent_node_ids, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); + // instead of gather, using permutation_iterator, which is ~17% faster + + thrust::inclusive_scan_by_key( + rmm::exec_policy(stream), + sorted_node_levels.begin(), + sorted_node_levels.end(), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::equal_to{}, + thrust::maximum{}); +} + // Generates a tree representation of the given tokens, token_indices. tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, @@ -166,12 +238,86 @@ tree_meta_t get_tree_representation(device_span tokens, }; auto num_tokens = tokens.size(); - auto is_node_it = thrust::make_transform_iterator( - tokens.begin(), - [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); - auto num_nodes = thrust::count_if( + auto num_nodes = thrust::count_if( rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); + // Node levels: transform_exclusive_scan, copy_if. 
+ rmm::device_uvector node_levels(num_nodes, stream, mr); + { + rmm::device_uvector token_levels(num_tokens, stream); + auto push_pop_it = thrust::make_transform_iterator( + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { + return does_push(token) - does_pop(token); + }); + thrust::exclusive_scan( + rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); + + auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), + token_levels.begin(), + token_levels.begin() + num_tokens, + tokens.begin(), + node_levels.begin(), + is_node); + CUDF_EXPECTS(thrust::distance(node_levels.begin(), node_levels_end) == num_nodes, + "node level count mismatch"); + } + + // Node parent ids: + // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. + rmm::device_uvector parent_node_ids(num_nodes, stream, mr); + // This block of code is a generalized logical stack algorithm. TODO: make this a separate function. + { + rmm::device_uvector node_token_ids(num_nodes, stream); + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, + tokens.begin(), + node_token_ids.begin(), + is_node); + + // previous push node_id + // if previous node is a push, then i-1 + // if previous node is FE, then i-2 (returns FB's index) + // if previous node is SMB and its previous node is a push, then i-2 + // eg. `{ SMB FB FE VB VE SME` -> `{` index as FB's parent. 
+ // else -1 + auto first_childs_parent_token_id = [tokens_gpu = + tokens.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) { return -1; } + if (tokens_gpu[i - 1] == token_t::StructBegin or tokens_gpu[i - 1] == token_t::ListBegin) { + return i - 1; + } else if (tokens_gpu[i - 1] == token_t::FieldNameEnd) { + return i - 2; + } else if (tokens_gpu[i - 1] == token_t::StructMemberBegin and + (tokens_gpu[i - 2] == token_t::StructBegin || + tokens_gpu[i - 2] == token_t::ListBegin)) { + return i - 2; + } else { + return -1; + } + }; + + thrust::transform( + rmm::exec_policy(stream), + node_token_ids.begin(), + node_token_ids.end(), + parent_node_ids.begin(), + [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__( + NodeIndexT const tid) -> NodeIndexT { + auto pid = first_childs_parent_token_id(tid); + return pid < 0 + ? parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + // parent_node_sentinel is -1, useful for segmented max operation below + }); + } + // Propagate parent node to siblings from first sibling - inplace. + propagate_parent_to_siblings( + cudf::device_span{node_levels.data(), node_levels.size()}, + parent_node_ids, + stream); + // Node categories: copy_if with transform. rmm::device_uvector node_categories(num_nodes, stream, mr); auto node_categories_it = @@ -184,24 +330,6 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, "node category count mismatch"); - // Node levels: transform_exclusive_scan, copy_if. 
- rmm::device_uvector token_levels(num_tokens, stream); - auto push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { - return does_push(token) - does_pop(token); - }); - thrust::exclusive_scan( - rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); - - rmm::device_uvector node_levels(num_nodes, stream, mr); - auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), - token_levels.begin(), - token_levels.begin() + num_tokens, - tokens.begin(), - node_levels.begin(), - is_node); - CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); - // Node ranges: copy_if with transform. rmm::device_uvector node_range_begin(num_nodes, stream, mr); rmm::device_uvector node_range_end(num_nodes, stream, mr); @@ -223,69 +351,6 @@ tree_meta_t get_tree_representation(device_span tokens, }); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); - // Node parent ids: previous push token_id transform, stable sort, segmented scan with Max, - // reorder, copy_if. This one is sort of logical stack. But more generalized. - // TODO: make it own function. - rmm::device_uvector parent_token_ids(num_tokens, stream); - rmm::device_uvector initial_order(num_tokens, stream); - // TODO re-write the algorithm to work only on nodes, not tokens. - - thrust::sequence(rmm::exec_policy(stream), initial_order.begin(), initial_order.end()); - thrust::tabulate(rmm::exec_policy(stream), - parent_token_ids.begin(), - parent_token_ids.end(), - [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { - return (i > 0) && does_push(tokens_gpu[i - 1]) ? i - 1 : -1; - // -1, not sentinel used here because of max operation below - }); - - auto out_pid = thrust::make_zip_iterator(parent_token_ids.data(), initial_order.data()); - // Uses radix sort for builtin types. 
- thrust::stable_sort_by_key(rmm::exec_policy(stream), - token_levels.data(), - token_levels.data() + token_levels.size(), - out_pid); - - // SegmentedScan Max. - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - token_levels.data(), - token_levels.data() + token_levels.size(), - parent_token_ids.data(), - parent_token_ids.data(), - thrust::equal_to{}, - thrust::maximum{}); - - // scatter to restore the original order. - { - rmm::device_uvector temp_storage(num_tokens, stream); - thrust::scatter(rmm::exec_policy(stream), - parent_token_ids.begin(), - parent_token_ids.end(), - initial_order.begin(), - temp_storage.begin()); - thrust::copy( - rmm::exec_policy(stream), temp_storage.begin(), temp_storage.end(), parent_token_ids.begin()); - } - - rmm::device_uvector node_ids_gpu(num_tokens, stream); - thrust::exclusive_scan( - rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); - - rmm::device_uvector parent_node_ids(num_nodes, stream, mr); - auto parent_node_ids_it = thrust::make_transform_iterator( - parent_token_ids.begin(), - [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { - return pid < 0 ? 
parent_node_sentinel : node_ids_gpu[pid]; - }); - auto parent_node_ids_end = thrust::copy_if(rmm::exec_policy(stream), - parent_node_ids_it, - parent_node_ids_it + parent_token_ids.size(), - tokens.begin(), - parent_node_ids.begin(), - is_node); - CUDF_EXPECTS(parent_node_ids_end - parent_node_ids.begin() == num_nodes, - "parent node id gather mismatch"); - return {std::move(node_categories), std::move(parent_node_ids), std::move(node_levels), diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 10d209b2ea6..8a0f3566d58 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -104,6 +104,9 @@ enum node_t : NodeT { */ enum class json_col_t : char { ListColumn, StructColumn, StringColumn, Unknown }; +// Default name for a list's child column +constexpr auto list_child_name{"element"}; + /** * @brief Intermediate representation of data from a nested JSON input */ diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 5d60a564b9b..29a29a1f9d5 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1162,9 +1162,6 @@ void make_json_column(json_column& root_column, // Range of encapsulating function that parses to internal columnar data representation CUDF_FUNC_RANGE(); - // Default name for a list's child column - std::string const list_child_name = "element"; - // Parse the JSON and get the token stream const auto [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); @@ -1286,7 +1283,7 @@ void make_json_column(json_column& root_column, * (b) a list, the selected child column corresponds to single child column of * the list column. In this case, the child column may not exist yet. 
*/ - auto get_selected_column = [&list_child_name](std::stack& current_data_path) { + auto get_selected_column = [](std::stack& current_data_path) { json_column* selected_col = current_data_path.top().current_selected_col; // If the node does not have a selected column yet @@ -1680,7 +1677,8 @@ std::pair, std::vector> json_column_to size_type num_rows = json_col.child_offsets.size(); std::vector column_names{}; column_names.emplace_back("offsets"); - column_names.emplace_back(json_col.child_columns.begin()->first); + column_names.emplace_back( + json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first); rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async(json_col.child_offsets, stream, mr); @@ -1688,12 +1686,15 @@ std::pair, std::vector> json_column_to std::make_unique(data_type{type_id::INT32}, num_rows, d_offsets.release()); // Create children column auto [child_column, names] = - json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - get_child_schema(json_col.child_columns.begin()->first), - stream, - mr); + json_col.child_columns.empty() + ? std::pair, + std::vector>{std::make_unique(), {}} + : json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows - 1, diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index a5f6d737637..57d55be6145 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1860,11 +1860,8 @@ void PreprocessColumnData(hostdevice_vector& pages, out_buf.create(size, stream, mr); } - // for nested hierarchies, compute per-page start offset. 
- // it would be better/safer to be checking (schema.max_repetition_level > 0) here, but there's - // no easy way to get at that info here. we'd have to move this function into reader_impl.cu - if ((out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) || - out_buf.type.id() == type_id::LIST) { + // for nested hierarchies, compute per-page start offset + if (input_col.has_repetition) { thrust::exclusive_scan_by_key(rmm::exec_policy(stream), page_keys.begin(), page_keys.end(), diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 8f4cd5c6f3b..1a8c0f4cd9e 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -57,9 +57,16 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; struct input_column_info { int schema_idx; std::string name; + bool has_repetition; // size == nesting depth. the associated real output // buffer index in the dest column for each level of nesting. std::vector nesting; + + input_column_info(int _schema_idx, std::string _name, bool _has_repetition) + : schema_idx(_schema_idx), name(_name), has_repetition(_has_repetition) + { + } + auto nesting_depth() const { return nesting.size(); } }; diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 07869189089..0997d2a968d 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -684,8 +684,8 @@ class aggregate_reader_metadata { // if I have no children, we're at a leaf and I'm an input column (that is, one with actual // data stored) so add me to the list. 
if (schema_elem.num_children == 0) { - input_column_info& input_col = - input_columns.emplace_back(input_column_info{schema_idx, schema_elem.name}); + input_column_info& input_col = input_columns.emplace_back( + input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0}); // set up child output column for one-level encoding list if (schema_elem.is_one_level_list()) { diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index cb9cd4293b5..25094536cce 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -225,6 +225,7 @@ dremel_data get_dremel_data(column_view h_col, cudf::detail::device_single_thread( [offset_at_level = d_column_offsets.data(), end_idx_at_level = d_column_ends.data(), + level_max = d_column_offsets.size(), col = *d_col] __device__() { auto curr_col = col; size_type off = curr_col.offset(); @@ -239,9 +240,11 @@ dremel_data get_dremel_data(column_view h_col, if (curr_col.type().id() == type_id::LIST) { off = curr_col.child(lists_column_view::offsets_column_index).element(off); end = curr_col.child(lists_column_view::offsets_column_index).element(end); - offset_at_level[level] = off; - end_idx_at_level[level] = end; - ++level; + if (level < level_max) { + offset_at_level[level] = off; + end_idx_at_level[level] = end; + ++level; + } curr_col = curr_col.child(lists_column_view::child_column_index); } else { curr_col = curr_col.child(0); diff --git a/cpp/src/sort/segmented_sort.cu b/cpp/src/sort/segmented_sort.cu index 3422330bf8b..c5f13df5305 100644 --- a/cpp/src/sort/segmented_sort.cu +++ b/cpp/src/sort/segmented_sort.cu @@ -24,7 +24,6 @@ #include #include -#include namespace cudf { namespace detail { @@ -35,24 +34,49 @@ namespace { */ enum class sort_method { STABLE, UNSTABLE }; -// returns segment indices for each element for all segments. -// first segment begin index = 0, last segment end index = num_rows. 
+/** + * @brief Builds indices to identify segments to sort + * + * The segments are added to the input table-view keys so they + * are lexicographically sorted within the segmented groups. + * + * ``` + * Example 1: + * num_rows = 10 + * offsets = {0, 3, 7, 10} + * segment-indices -> { 3,3,3, 7,7,7,7, 10,10,10 } + * ``` + * + * ``` + * Example 2: (offsets do not cover all indices) + * num_rows = 10 + * offsets = {3, 7} + * segment-indices -> { 0,1,2, 7,7,7,7, 8,9,10 } + * ``` + * + * @param num_rows Total number of rows in the input keys to sort + * @param offsets The offsets identifying the segments + * @param stream CUDA stream used for device memory operations and kernel launches + */ rmm::device_uvector get_segment_indices(size_type num_rows, column_view const& offsets, rmm::cuda_stream_view stream) { rmm::device_uvector segment_ids(num_rows, stream); - auto offset_begin = offsets.begin(); // assumes already offset column contains offset. - auto offsets_minus_one = thrust::make_transform_iterator( - offset_begin, [offset_begin] __device__(auto i) { return i - 1; }); + auto offset_begin = offsets.begin(); + auto offset_end = offsets.end(); auto counting_iter = thrust::make_counting_iterator(0); - thrust::lower_bound(rmm::exec_policy(stream), - offsets_minus_one, - offsets_minus_one + offsets.size(), - counting_iter, - counting_iter + segment_ids.size(), - segment_ids.begin()); + thrust::transform(rmm::exec_policy(stream), + counting_iter, + counting_iter + segment_ids.size(), + segment_ids.begin(), + [offset_begin, offset_end] __device__(auto idx) { + if (offset_begin == offset_end || idx < *offset_begin) { return idx; } + if (idx >= *(offset_end - 1)) { return idx + 1; } + return static_cast( + *thrust::upper_bound(thrust::seq, offset_begin, offset_end, idx)); + }); return segment_ids; } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index d7ab881861a..b8cd4622484 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ 
-813,7 +813,6 @@ TEST_P(JsonReaderDualTest, JsonLinesObjectsOutOfOrder) cudf::test::strings_column_wrapper({"aaa", "bbb"})); } -/* // currently, the json reader is strict about having non-empty input. TEST_F(JsonReaderTest, EmptyFile) { @@ -824,7 +823,9 @@ TEST_F(JsonReaderTest, EmptyFile) } cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}).lines(true); + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .experimental(true); auto result = cudf::io::read_json(in_options); const auto view = result.tbl->view(); @@ -832,6 +833,7 @@ TEST_F(JsonReaderTest, EmptyFile) } // currently, the json reader is strict about having non-empty input. +// experimental reader supports empty input TEST_F(JsonReaderTest, NoDataFile) { auto filepath = temp_env->get_temp_dir() + "NoDataFile.csv"; @@ -841,13 +843,14 @@ TEST_F(JsonReaderTest, NoDataFile) } cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}).lines(true); + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .experimental(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); const auto view = result.tbl->view(); EXPECT_EQ(0, view.num_columns()); } -*/ TEST_F(JsonReaderTest, ArrowFileSource) { diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index 3d024fe8af8..6f7e28a2ca3 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -773,7 +773,11 @@ std::vector json_lines_list = { { "a": { "y" : 6, "z": [] }} { "a": { "y" : 6, "z": [2, 3, 4, 5] }} { "a": { "z": [4], "y" : 6 }} - { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )"}; + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )", + // empty list, row. 
+ R"( {"a" : [], "b" : {}} + {"a" : []} + {"b" : {}})"}; INSTANTIATE_TEST_SUITE_P(Mixed_And_Records, JsonTreeTraversalTest, ::testing::Combine(::testing::Values(false), diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 134eff54144..6f1c5ef7eb1 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -2633,6 +2633,11 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes) 0, [string_per_row](cudf::size_type idx) { return idx * string_per_row; }); cudf::test::fixed_width_column_wrapper offsets(offset_iter, offset_iter + num_rows + 1); + + auto _c3_valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 200; }); + std::vector c3_valids(num_rows); + std::copy(_c3_valids, _c3_valids + num_rows, c3_valids.begin()); auto _c3_list = cudf::make_lists_column(num_rows, offsets.release(), @@ -2646,7 +2651,7 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes) c3_children.push_back(std::move(c3_list)); c3_children.push_back(c3_ints.release()); c3_children.push_back(c3_floats.release()); - cudf::test::structs_column_wrapper _c3(std::move(c3_children)); + cudf::test::structs_column_wrapper _c3(std::move(c3_children), c3_valids); auto c3 = cudf::purge_nonempty_nulls(static_cast(_c3)); // write it out diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index 4fd62f9b938..a8547ea982d 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -655,9 +655,9 @@ TEST_F(SegmentedReductionTestUntyped, PartialSegmentReduction) auto const input = fixed_width_column_wrapper{ {1, 2, 3, 4, 5, 6, 7}, {true, true, true, true, true, true, true}}; - auto const offsets = std::vector{0, 1, 3, 4}; + auto const offsets = std::vector{1, 3, 4}; auto const d_offsets = thrust::device_vector(offsets); - auto const expect = fixed_width_column_wrapper{{1, 5, 4}, {true, true, 
true}}; + auto const expect = fixed_width_column_wrapper{{5, 4}, {true, true}}; auto res = segmented_reduce(input, d_offsets, @@ -669,7 +669,7 @@ TEST_F(SegmentedReductionTestUntyped, PartialSegmentReduction) // Test with initial value auto const init_scalar = cudf::make_fixed_width_scalar(3); - auto const init_expect = fixed_width_column_wrapper{{4, 8, 7}, {true, true, true}}; + auto const init_expect = fixed_width_column_wrapper{{8, 7}, {true, true}}; res = segmented_reduce(input, d_offsets, @@ -681,8 +681,7 @@ TEST_F(SegmentedReductionTestUntyped, PartialSegmentReduction) // Test with null initial value init_scalar->set_valid_async(false); - auto null_init_expect = - fixed_width_column_wrapper{{XXX, XXX, XXX}, {false, false, false}}; + auto null_init_expect = fixed_width_column_wrapper{{XXX, XXX}, {false, false}}; res = segmented_reduce(input, d_offsets, diff --git a/cpp/tests/sort/segmented_sort_tests.cpp b/cpp/tests/sort/segmented_sort_tests.cpp index fb07bfde795..53642a89b3d 100644 --- a/cpp/tests/sort/segmented_sort_tests.cpp +++ b/cpp/tests/sort/segmented_sort_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -201,9 +201,13 @@ TEST_F(SegmentedSortInt, NonZeroSegmentsStart) column_wrapper segments1{{0, 2, 5, 8, 11}}; column_wrapper segments2{{ 2, 5, 8, 11}}; column_wrapper segments3{{ 6, 8, 11}}; + column_wrapper segments4{{ 6, 8}}; + column_wrapper segments5{{0, 3, 6}}; column_wrapper expected1{{0, 1, 2, 4, 3, 7, 5, 6, 9, 10, 8}}; column_wrapper expected2{{0, 1, 2, 4, 3, 7, 5, 6, 9, 10, 8}}; - column_wrapper expected3{{2, 4, 5, 3, 0, 1, 7, 6, 9, 10, 8}}; + column_wrapper expected3{{0, 1, 2, 3, 4, 5, 7, 6, 9, 10, 8}}; + column_wrapper expected4{{0, 1, 2, 3, 4, 5, 7, 6, 8, 9, 10}}; + column_wrapper expected5{{2, 0, 1, 4, 5, 3, 6, 7, 8, 9, 10}}; // clang-format on table_view input{{col1}}; auto results = cudf::detail::segmented_sorted_order(input, segments1); @@ -212,6 +216,10 @@ TEST_F(SegmentedSortInt, NonZeroSegmentsStart) CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected2); results = cudf::detail::segmented_sorted_order(input, segments3); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected3); + results = cudf::detail::segmented_sorted_order(input, segments4); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected4); + results = cudf::detail::segmented_sorted_order(input, segments5); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected5); } TEST_F(SegmentedSortInt, Sliced) @@ -219,13 +227,13 @@ TEST_F(SegmentedSortInt, Sliced) using T = int; // clang-format off column_wrapper col1{{8, 9, 2, 3, 2, 2, 4, 1, 7, 5, 6}}; - // sliced 2, 2, 4, 1, 7, 5, 6 + // sliced 2, 2, 4, 1, 7, 5, 6 column_wrapper segments1{{0, 2, 5}}; column_wrapper segments2{{-4, 0, 2, 5}}; column_wrapper segments3{{ 7}}; column_wrapper expected1{{0, 1, 3, 2, 4, 5, 6}}; column_wrapper expected2{{0, 1, 3, 2, 4, 5, 6}}; - column_wrapper expected3{{3, 0, 1, 2, 5, 6, 4}}; + column_wrapper expected3{{0, 1, 2, 3, 4, 5, 6}}; // clang-format on auto slice = cudf::slice(col1, {4, 11})[0]; // 7 elements table_view input{{slice}}; diff --git a/python/cudf/cudf/tests/test_json.py 
b/python/cudf/cudf/tests/test_json.py index 1fdef44546a..fb2c24b3757 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -649,6 +649,24 @@ def test_json_nested_data(): assert df.to_arrow().equals(pa_table_pdf) +def test_json_empty_types(): + json_str = """ {} + {"a": [], "b": {}} + {"a": []} + {"b": {}} + {"c": {"d": []}} + {"e": [{}]} + """ + df = cudf.read_json( + StringIO(json_str), + engine="cudf_experimental", + orient="records", + lines=True, + ) + pdf = pd.read_json(StringIO(json_str), orient="records", lines=True) + assert_eq(df, pdf) + + def test_json_types_data(): # 0:<0:string,1:float> # 1:list diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 366b2e0ebae..0a0647f1297 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -451,7 +451,7 @@ """ doc_to_orc = docfmt_partial(docstring=_docstring_to_orc) -_docstring_read_json = """ +_docstring_read_json = r""" Load a JSON dataset into a DataFrame Parameters @@ -466,8 +466,13 @@ engine : {{ 'auto', 'cudf', 'cudf_experimental', 'pandas' }}, default 'auto' Parser engine to use. If 'auto' is passed, the engine will be automatically selected based on the other parameters. -orient : string, - Indication of expected JSON string format (pandas engine only). +orient : string + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + Indication of expected JSON string format. Compatible JSON strings can be produced by ``to_json()`` with a corresponding orient value. The set of possible orients is: @@ -500,12 +505,23 @@ typ : type of object to recover (series or frame), default 'frame' With cudf engine, only frame output is supported. dtype : boolean or dict, default True - If True, infer dtypes, if a dict of column to dtype, then use those, - if False, then don't infer dtypes at all, applies only to the data. 
+ If True, infer dtypes for all columns; if False, then don't infer dtypes at all, + if a dict, provide a mapping from column names to their respective dtype (any missing + columns will have their dtype inferred). Applies only to the data. convert_axes : boolean, default True - Try to convert the axes to the proper dtypes (pandas engine only). + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + Try to convert the axes to the proper dtypes. convert_dates : boolean, default True - List of columns to parse for dates (pandas engine only); If True, then try + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + List of columns to parse for dates; If True, then try to parse datelike columns default is True; a column label is datelike if * it ends with ``'_at'``, @@ -514,27 +530,57 @@ * it is ``'modified'``, or * it is ``'date'`` keep_default_dates : boolean, default True - If parsing dates, parse the default datelike columns (pandas engine only) + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + If parsing dates, parse the default datelike columns. numpy : boolean, default False - Direct decoding to numpy arrays (pandas engine only). Supports numeric + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + Direct decoding to numpy arrays. Supports numeric data only, but non-numeric column and index labels are supported. Note also that the JSON ordering MUST be the same for each term if numpy=True. precise_float : boolean, default False + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + Set to enable usage of higher precision (strtod) function when decoding string to double values (pandas engine only). 
Default (False) is to use fast but less precise builtin functionality date_unit : string, default None - The timestamp unit to detect if converting dates (pandas engine only). + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + The timestamp unit to detect if converting dates. The default behavior is to try and detect the correct precision, but if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, milliseconds, microseconds or nanoseconds. encoding : str, default is 'utf-8' + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + The encoding to use to decode py3 bytes. With cudf engine, only utf-8 is supported. lines : boolean, default False Read the file as a json object per line. chunksize : integer, default None - Return JsonReader object for iteration (pandas engine only). + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + Return JsonReader object for iteration. See the `line-delimited json docs `_ for more information on ``chunksize``. @@ -547,12 +593,22 @@ otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to None for no decompression. byte_range : list or tuple, default None - Byte range within the input file to be read (cudf engine only). + + .. admonition:: GPU-accelerated + + This parameter is only supported with ``engine='cudf'``. + + Byte range within the input file to be read. The first number is the offset in bytes, the second number is the range size in bytes. Set the size to zero to read all data after the offset location. Reads the row that starts before or at the end of the range, even if it ends after the end of the range. keep_quotes : bool, default False + + .. admonition:: GPU-accelerated experimental feature + + This parameter is only supported with ``engine='cudf_experimental'``. 
- This parameter is only supported in ``cudf_experimental`` engine. If `True`, any string values are read literally (and wrapped in an additional set of quotes).