diff --git a/CHANGELOG.md b/CHANGELOG.md
index de00213a6f6..b46ac22d767 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# cuDF 21.12.00 (Date TBD)
+
+Please see https://github.com/rapidsai/cudf/releases/tag/v21.12.00a for the latest changes to this development branch.
+
# cuDF 21.10.00 (Date TBD)
Please see https://github.com/rapidsai/cudf/releases/tag/v21.10.00a for the latest changes to this development branch.
diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh
index e73153ce0c3..c2544ff7ffe 100755
--- a/ci/benchmark/build.sh
+++ b/ci/benchmark/build.sh
@@ -36,6 +36,9 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/"
# like `/tmp` is.
export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache"
+# Dask & Distributed git tag
+export DASK_DISTRIBUTED_GIT_TAG='2021.09.1'
+
function remove_libcudf_kernel_cache_dir {
EXITCODE=$?
logger "removing kernel cache dir: $LIBCUDF_KERNEL_CACHE_PATH"
@@ -75,10 +78,10 @@ conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \
# conda install "your-pkg=1.0.0"
# Install the master version of dask, distributed, and streamz
-logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps"
-pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
-logger "pip install git+https://github.com/dask/dask.git@main --upgrade --no-deps"
-pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
+logger "pip install git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG --upgrade --no-deps"
+pip install "git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps
+logger "pip install git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG --upgrade --no-deps"
+pip install "git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps
logger "pip install git+https://github.com/python-streamz/streamz.git@master --upgrade --no-deps"
pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index 8e5b4d80115..7c5b9d836dd 100755
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -30,6 +30,9 @@ export CONDA_ARTIFACT_PATH="$WORKSPACE/ci/artifacts/cudf/cpu/.conda-bld/"
export GIT_DESCRIBE_TAG=`git describe --tags`
export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'`
+# Dask & Distributed git tag
+export DASK_DISTRIBUTED_GIT_TAG='2021.09.1'
+
################################################################################
# TRAP - Setup trap for removing jitify cache
################################################################################
@@ -101,8 +104,8 @@ function install_dask {
# Install the main version of dask, distributed, and streamz
gpuci_logger "Install the main version of dask, distributed, and streamz"
set -x
- pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
- pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps
+ pip install "git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps
+ pip install "git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps
# Need to uninstall streamz that is already in the env.
pip uninstall -y streamz
pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps
diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml
index bbe1ae70499..d5251b18582 100644
--- a/conda/environments/cudf_dev_cuda11.0.yml
+++ b/conda/environments/cudf_dev_cuda11.0.yml
@@ -10,7 +10,7 @@ dependencies:
- clang=11.0.0
- clang-tools=11.0.0
- cupy>7.1.0,<10.0.0a0
- - rmm=21.10.*
+ - rmm=21.12.*
- cmake>=3.20.1
- cmake_setuptools>=0.1.3
- python>=3.7,<3.9
@@ -39,8 +39,8 @@ dependencies:
- mypy=0.782
- typing_extensions
- pre_commit
- - dask>=2021.6.0
- - distributed>=2021.6.0
+ - dask=2021.09.1
+ - distributed=2021.09.1
- streamz
- arrow-cpp=5.0.0
- dlpack>=0.5,<0.6.0a0
@@ -58,7 +58,7 @@ dependencies:
- transformers
- pydata-sphinx-theme
- pip:
- - git+https://github.com/dask/dask.git@main
- - git+https://github.com/dask/distributed.git@main
+ - git+https://github.com/dask/dask.git@2021.09.1
+ - git+https://github.com/dask/distributed.git@2021.09.1
- git+https://github.com/python-streamz/streamz.git@master
- pyorc
diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml
index ed4c3ee2efc..7ab2cd60ce3 100644
--- a/conda/environments/cudf_dev_cuda11.2.yml
+++ b/conda/environments/cudf_dev_cuda11.2.yml
@@ -10,7 +10,7 @@ dependencies:
- clang=11.0.0
- clang-tools=11.0.0
- cupy>7.1.0,<10.0.0a0
- - rmm=21.10.*
+ - rmm=21.12.*
- cmake>=3.20.1
- cmake_setuptools>=0.1.3
- python>=3.7,<3.9
@@ -39,8 +39,8 @@ dependencies:
- mypy=0.782
- typing_extensions
- pre_commit
- - dask>=2021.6.0
- - distributed>=2021.6.0
+ - dask=2021.09.1
+ - distributed=2021.09.1
- streamz
- arrow-cpp=5.0.0
- dlpack>=0.5,<0.6.0a0
@@ -58,7 +58,7 @@ dependencies:
- transformers
- pydata-sphinx-theme
- pip:
- - git+https://github.com/dask/dask.git@main
- - git+https://github.com/dask/distributed.git@main
+ - git+https://github.com/dask/dask.git@2021.09.1
+ - git+https://github.com/dask/distributed.git@2021.09.1
- git+https://github.com/python-streamz/streamz.git@master
- pyorc
diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml
index d0965e97567..db8aa8e6c85 100644
--- a/conda/recipes/custreamz/meta.yaml
+++ b/conda/recipes/custreamz/meta.yaml
@@ -31,8 +31,8 @@ requirements:
- python
- streamz
- cudf {{ version }}
- - dask>=2021.6.0
- - distributed>=2021.6.0
+ - dask=2021.09.1
+ - distributed=2021.09.1
- python-confluent-kafka
- cudf_kafka {{ version }}
diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml
index 1b2c4efd610..45d96a2de85 100644
--- a/conda/recipes/dask-cudf/meta.yaml
+++ b/conda/recipes/dask-cudf/meta.yaml
@@ -26,13 +26,13 @@ requirements:
host:
- python
- cudf {{ version }}
- - dask>=2021.6.0
- - distributed>=2021.6.0
+ - dask=2021.09.1
+ - distributed=2021.09.1
run:
- python
- cudf {{ version }}
- - dask>=2021.6.0
- - distributed>=2021.6.0
+ - dask=2021.09.1
+ - distributed=2021.09.1
test: # [linux64]
requires: # [linux64]
diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh
index 3fc1182b33b..f56610bea86 100644
--- a/conda/recipes/dask-cudf/run_test.sh
+++ b/conda/recipes/dask-cudf/run_test.sh
@@ -8,6 +8,15 @@ function logger() {
echo -e "\n>>>> $@\n"
}
+# Importing cudf on arm64 CPU-only nodes currently fails due to a
+# difference in reported GPU devices between arm64 and amd64.
+ARCH=$(arch)
+
+if [ "${ARCH}" = "aarch64" ]; then
+ logger "Skipping tests on arm64"
+ exit 0
+fi
+
# Install the latest version of dask and distributed
logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps"
pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 0f05dcb4bb3..fd687de6698 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -93,6 +93,7 @@ test:
- test -f $PREFIX/include/cudf/detail/sequence.hpp
- test -f $PREFIX/include/cudf/detail/sorting.hpp
- test -f $PREFIX/include/cudf/detail/stream_compaction.hpp
+ - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp
- test -f $PREFIX/include/cudf/detail/transform.hpp
- test -f $PREFIX/include/cudf/detail/transpose.hpp
- test -f $PREFIX/include/cudf/detail/unary.hpp
@@ -238,6 +239,7 @@ test:
- test -f $PREFIX/include/cudf_test/cudf_gtest.hpp
- test -f $PREFIX/include/cudf_test/cxxopts.hpp
- test -f $PREFIX/include/cudf_test/file_utilities.hpp
+ - test -f $PREFIX/include/cudf_test/io_metadata_utilities.hpp
- test -f $PREFIX/include/cudf_test/iterator_utilities.hpp
- test -f $PREFIX/include/cudf_test/table_utilities.hpp
- test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c72c258fd18..982fee640d9 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -28,7 +28,7 @@ include(rapids-find)
rapids_cuda_init_architectures(CUDF)
-project(CUDF VERSION 21.10.00 LANGUAGES C CXX CUDA)
+project(CUDF VERSION 21.12.00 LANGUAGES C CXX CUDA)
# Needed because GoogleBenchmark changes the state of FindThreads.cmake,
# causing subsequent runs to have different values for the `Threads::Threads` target.
@@ -236,8 +236,9 @@ add_library(cudf
src/groupby/sort/group_max_scan.cu
src/groupby/sort/group_min_scan.cu
src/groupby/sort/group_rank_scan.cu
- src/groupby/sort/group_sum_scan.cu
src/groupby/sort/group_replace_nulls.cu
+ src/groupby/sort/group_sum_scan.cu
+ src/groupby/sort/group_tdigest.cu
src/groupby/sort/sort_helper.cu
src/hash/hashing.cu
src/hash/md5_hash.cu
@@ -318,6 +319,7 @@ add_library(cudf
src/merge/merge.cu
src/partitioning/partitioning.cu
src/partitioning/round_robin.cu
+ src/quantiles/tdigest/tdigest.cu
src/quantiles/quantile.cu
src/quantiles/quantiles.cu
src/reductions/all.cu
@@ -565,6 +567,7 @@ add_library(cudftestutil STATIC
tests/utilities/base_fixture.cpp
tests/utilities/column_utilities.cu
tests/utilities/table_utilities.cu
+ tests/io/metadata_utilities.cpp
tests/strings/utilities.cu)
set_target_properties(cudftestutil
diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake
index cade101cbfd..16d50fd3388 100644
--- a/cpp/cmake/thirdparty/get_nvcomp.cmake
+++ b/cpp/cmake/thirdparty/get_nvcomp.cmake
@@ -21,7 +21,7 @@ function(find_and_configure_nvcomp VERSION)
GLOBAL_TARGETS nvcomp::nvcomp
CPM_ARGS
GITHUB_REPOSITORY NVIDIA/nvcomp
- GIT_TAG 4f4e5713e69473be6e0c8ae483a932f666ae3c2f
+ GIT_TAG aa003db89e052e4ce408910ff17e1054b7c43b7d
OPTIONS "BUILD_STATIC ON"
"BUILD_TESTS OFF"
"BUILD_BENCHMARKS OFF"
diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
index 72524996a69..1141f20e3b1 100644
--- a/cpp/doxygen/Doxyfile
+++ b/cpp/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf"
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = 21.10.00
+PROJECT_NUMBER = 21.12.00
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
@@ -2167,7 +2167,7 @@ SKIP_FUNCTION_MACROS = YES
# the path). If a tag file is not located in the directory in which doxygen is
# run, you must also specify the path to the tagfile here.
-TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.10
+TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.12
# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
# tag file that is based on the input files it reads. See section "Linking to
diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt
index aef477c91e1..4175b34ff40 100644
--- a/cpp/examples/basic/CMakeLists.txt
+++ b/cpp/examples/basic/CMakeLists.txt
@@ -6,7 +6,7 @@ set(CPM_DOWNLOAD_VERSION v0.32.2)
file(DOWNLOAD https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake)
include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake)
-set(CUDF_TAG branch-21.10)
+set(CUDF_TAG branch-21.12)
CPMFindPackage(NAME cudf
GIT_REPOSITORY https://github.com/rapidsai/cudf
GIT_TAG ${CUDF_TAG}
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index c302895880d..fb6401a3cc1 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -87,7 +87,9 @@ class aggregation {
CUDA, ///< CUDA UDF based reduction
MERGE_LISTS, ///< merge multiple lists values into one list
MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries
- MERGE_M2 ///< merge partial values of M2 aggregation
+ MERGE_M2, ///< merge partial values of M2 aggregation
+ TDIGEST, ///< create a tdigest from a set of input values
+ MERGE_TDIGEST ///< create a tdigest by merging multiple tdigests together
};
aggregation() = delete;
@@ -493,5 +495,80 @@ std::unique_ptr<Base> make_merge_sets_aggregation(null_equality nulls_equal = nu
template <typename Base = aggregation>
std::unique_ptr<Base> make_merge_m2_aggregation();
+/**
+ * @brief Factory to create a TDIGEST aggregation
+ *
+ * Produces a tdigest (https://arxiv.org/pdf/1902.04023.pdf) column from input values.
+ * The input aggregation values are expected to be fixed-width numeric types.
+ *
+ * The tdigest column produced is of the following structure:
+ *
+ * struct {
+ * // centroids for the digest
+ * list {
+ * struct {
+ * double // mean
+ * double // weight
+ * },
+ * ...
+ * }
+ * // these are from the input stream, not the centroids. they are used
+ * // during the percentile_approx computation near the beginning or
+ * // end of the quantiles
+ * double // min
+ * double // max
+ * }
+ *
+ * Each output row is a single tdigest. The length of the row is the "size" of the
+ * tdigest, each element of which represents a weighted centroid (mean, weight).
+ *
+ * @param max_centroids Parameter controlling compression level and accuracy on subsequent
+ * queries on the output tdigest data. `max_centroids` places an upper bound on the size of
+ * the computed tdigests: A value of 1000 will result in a tdigest containing no
+ * more than 1000 centroids (32 bytes each). Higher values result in more accurate tdigest information.
+ *
+ * @returns A TDIGEST aggregation object.
+ */
+template <typename Base = aggregation>
+std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids = 1000);
+
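For readers of this diff, here is a minimal usage sketch (not part of the change itself) showing how the new TDIGEST aggregation is expected to flow through the existing groupby API. The helper name `group_tdigests`, the single key column, and the 1000-centroid cap are illustrative assumptions only.

```cpp
#include <cudf/aggregation.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <utility>
#include <vector>

// Hypothetical helper: one tdigest per group of `values`, keyed by `keys`.
std::unique_ptr<cudf::column> group_tdigests(cudf::column_view const& keys,
                                             cudf::column_view const& values)
{
  // Group by a single key column; multi-key grouping works the same way.
  cudf::groupby::groupby gb(cudf::table_view({keys}));

  std::vector<cudf::groupby::aggregation_request> requests;
  requests.emplace_back();
  requests.back().values = values;
  // Cap each per-group tdigest at 1000 centroids.
  requests.back().aggregations.push_back(
    cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(1000));

  auto result = gb.aggregate(requests);
  // result.second[0].results[0] is a struct column with the layout documented
  // above: list<struct<mean, weight>>, min, max -- one row per group.
  return std::move(result.second[0].results[0]);
}
```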
+/**
+ * @brief Factory to create a MERGE_TDIGEST aggregation
+ *
+ * Merges the results from a previous aggregation resulting from a `make_tdigest_aggregation`
+ * or `make_merge_tdigest_aggregation` to produce a new tdigest
+ * (https://arxiv.org/pdf/1902.04023.pdf) column.
+ *
+ * The tdigest column produced is of the following structure:
+ *
+ * struct {
+ * // centroids for the digest
+ * list {
+ * struct {
+ * double // mean
+ * double // weight
+ * },
+ * ...
+ * }
+ * // these are from the input stream, not the centroids. they are used
+ * // during the percentile_approx computation near the beginning or
+ * // end of the quantiles
+ * double // min
+ * double // max
+ * }
+ *
+ * Each output row is a single tdigest. The length of the row is the "size" of the
+ * tdigest, each element of which represents a weighted centroid (mean, weight).
+ *
+ * @param max_centroids Parameter controlling compression level and accuracy on subsequent
+ * queries on the output tdigest data. `max_centroids` places an upper bound on the size of
+ * the computed tdigests: A value of 1000 will result in a tdigest containing no
+ * more than 1000 centroids (32 bytes each). Higher values result in more accurate tdigest information.
+ *
+ * @returns A MERGE_TDIGEST aggregation object.
+ */
+template <typename Base = aggregation>
+std::unique_ptr<Base> make_merge_tdigest_aggregation(int max_centroids = 1000);
+
/** @} */ // end of group
} // namespace cudf
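A companion sketch for the MERGE_TDIGEST path, again hedged and not part of this diff: it reduces previously computed tdigests, for example one per partition, into a single tdigest per key. The helper name `merge_group_tdigests` is hypothetical, and the values column is assumed to already carry the tdigest struct layout documented above.

```cpp
#include <cudf/aggregation.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>

#include <memory>
#include <utility>
#include <vector>

// Hypothetical helper: merge partial tdigests into one tdigest per key.
std::unique_ptr<cudf::column> merge_group_tdigests(cudf::column_view const& keys,
                                                   cudf::column_view const& partial_tdigests)
{
  cudf::groupby::groupby gb(cudf::table_view({keys}));

  std::vector<cudf::groupby::aggregation_request> requests;
  requests.emplace_back();
  // `partial_tdigests` must be a tdigest struct column, e.g. the output of a
  // prior TDIGEST or MERGE_TDIGEST aggregation.
  requests.back().values = partial_tdigests;
  requests.back().aggregations.push_back(
    cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(1000));

  auto result = gb.aggregate(requests);
  return std::move(result.second[0].results[0]);
}
```

The merged column keeps the same struct layout, so it can be fed into further MERGE_TDIGEST passes or into the tdigest quantile support this release adds under src/quantiles/tdigest/ (see the CMakeLists change above).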
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 4cf902ef562..05d1bf3e595 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -91,6 +91,10 @@ class simple_aggregations_collector { // Declares the interface for the simple
class merge_sets_aggregation const& agg);
virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
class merge_m2_aggregation const& agg);
+ virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+ class tdigest_aggregation const& agg);
+ virtual std::vector<std::unique_ptr<aggregation>> visit(
+ data_type col_type, class merge_tdigest_aggregation const& agg);
};
class aggregation_finalizer { // Declares the interface for the finalizer
@@ -125,6 +129,8 @@ class aggregation_finalizer { // Declares the interface for the finalizer
virtual void visit(class merge_lists_aggregation const& agg);
virtual void visit(class merge_sets_aggregation const& agg);
virtual void visit(class merge_m2_aggregation const& agg);
+ virtual void visit(class tdigest_aggregation const& agg);
+ virtual void visit(class merge_tdigest_aggregation const& agg);
};
/**
@@ -884,6 +890,54 @@ class merge_m2_aggregation final : public groupby_aggregation {
void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
};
+/**
+ * @brief Derived aggregation class for specifying TDIGEST aggregation
+ */
+class tdigest_aggregation final : public groupby_aggregation {
+ public:
+ explicit tdigest_aggregation(int max_centroids_)
+ : aggregation{TDIGEST}, max_centroids{max_centroids_}
+ {
+ }
+
+ int const max_centroids;
+
+ std::unique_ptr<aggregation> clone() const override
+ {
+ return std::make_unique<tdigest_aggregation>(*this);
+ }
+ std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
+ data_type col_type, simple_aggregations_collector& collector) const override
+ {
+ return collector.visit(col_type, *this);
+ }
+ void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
+/**
+ * @brief Derived aggregation class for specifying MERGE_TDIGEST aggregation
+ */
+class merge_tdigest_aggregation final : public groupby_aggregation {
+ public:
+ explicit merge_tdigest_aggregation(int max_centroids_)
+ : aggregation{MERGE_TDIGEST}, max_centroids{max_centroids_}
+ {
+ }
+
+ int const max_centroids;
+
+ std::unique_ptr<aggregation> clone() const override
+ {
+ return std::make_unique<merge_tdigest_aggregation>(*this);
+ }
+ std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
+ data_type col_type, simple_aggregations_collector& collector) const override
+ {
+ return collector.visit(col_type, *this);
+ }
+ void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
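Both new detail classes plug into libcudf's existing double-dispatch machinery: `finalize` hands the concrete aggregation to an `aggregation_finalizer`, and `get_simple_aggregations` does the same with a `simple_aggregations_collector`. The following sketch, which assumes only the detail header shown in this hunk, illustrates the pattern; the `tdigest_kind_detector` class is hypothetical and not part of libcudf.

```cpp
#include <cudf/detail/aggregation/aggregation.hpp>

// Hypothetical finalizer that records whether a tdigest-related aggregation
// was seen. Only the overloads of interest are overridden; every other
// aggregation kind falls through to the base-class default.
class tdigest_kind_detector : public cudf::detail::aggregation_finalizer {
 public:
  void visit(cudf::detail::tdigest_aggregation const&) override { tdigest_seen = true; }
  void visit(cudf::detail::merge_tdigest_aggregation const&) override { merge_seen = true; }

  bool tdigest_seen = false;
  bool merge_seen   = false;
};

// Usage: agg.finalize(detector) invokes whichever override matches agg's
// dynamic type, which is exactly what the finalize() one-liners above do.
```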
/**
* @brief Sentinel value used for `ARGMAX` aggregation.
*
@@ -954,14 +1008,16 @@ template <typename Source, aggregation::Kind k>
struct target_type_impl<
Source,
k,
- std::enable_if_t() && !is_chrono