From 0a00579350e37e9fb9c159f22c4def8c60c6f8f5 Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Wed, 25 Aug 2021 13:15:27 -0500 Subject: [PATCH 01/26] Remove -g from cython compile commands (#9074) Removes `-g` from the compile commands generated by distutils to compile Cython files. This will make our container images, conda packages, and python wheels smaller. --- ci/gpu/build.sh | 4 +- conda/environments/cudf_dev_cuda11.0.yml | 4 +- conda/environments/cudf_dev_cuda11.2.yml | 4 +- python/cudf/setup.py | 81 ++++++++++++++++-------- 4 files changed, 62 insertions(+), 31 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 8ebc85e5736..d825de2ad04 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -101,8 +101,8 @@ function install_dask { # Install the main version of dask, distributed, and streamz gpuci_logger "Install the main version of dask, distributed, and streamz" set -x - pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps - pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps + pip install "git+https://github.com/dask/distributed.git@2021.07.1" --upgrade --no-deps + pip install "git+https://github.com/dask/dask.git@2021.07.1" --upgrade --no-deps # Need to uninstall streamz that is already in the env. 
pip uninstall -y streamz pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 62b59c3f081..d8635b09f8b 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -58,7 +58,7 @@ dependencies: - cachetools - transformers - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main + - git+https://github.com/dask/dask.git@2021.07.1 + - git+https://github.com/dask/distributed.git@2021.07.1 - git+https://github.com/python-streamz/streamz.git@master - pyorc diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 94c7116802b..61af2f8aef1 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -58,7 +58,7 @@ dependencies: - cachetools - transformers - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main + - git+https://github.com/dask/dask.git@2021.07.1 + - git+https://github.com/dask/distributed.git@2021.07.1 - git+https://github.com/python-streamz/streamz.git@master - pyorc diff --git a/python/cudf/setup.py b/python/cudf/setup.py index 54921396b6f..e9fd3ae9d1f 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -6,13 +6,24 @@ import subprocess import sys import sysconfig + +# Must import in this order: +# setuptools -> Cython.Distutils.build_ext -> setuptools.command.build_ext +# Otherwise, setuptools.command.build_ext ends up inheriting from +# Cython.Distutils.old_build_ext which we do not want +import setuptools + +try: + from Cython.Distutils.build_ext import new_build_ext as _build_ext +except ImportError: + from setuptools.command.build_ext import build_ext as _build_ext + from distutils.spawn 
import find_executable from distutils.sysconfig import get_python_lib import numpy as np import pyarrow as pa -from Cython.Build import cythonize -from Cython.Distutils import build_ext +import setuptools.command.build_ext from setuptools import find_packages, setup from setuptools.extension import Extension @@ -105,22 +116,46 @@ def get_cuda_version_from_header(cuda_include_dir, delimeter=""): ), ) -try: - nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0") -except Exception: - nthreads = 0 -cmdclass = versioneer.get_cmdclass() +class build_ext_and_proto_no_debug(_build_ext): + def build_extensions(self): + def remove_flags(compiler, *flags): + for flag in flags: + try: + compiler.compiler_so = list( + filter((flag).__ne__, compiler.compiler_so) + ) + except Exception: + pass + # Full optimization + self.compiler.compiler_so.append("-O3") + # Silence '-Wunknown-pragmas' warning + self.compiler.compiler_so.append("-Wno-unknown-pragmas") + # No debug symbols, full optimization, no '-Wstrict-prototypes' warning + remove_flags( + self.compiler, "-g", "-G", "-O1", "-O2", "-Wstrict-prototypes" + ) + super().build_extensions() -class build_ext_and_proto(build_ext): - def build_extensions(self): - try: - # Silence the '-Wstrict-prototypes' warning - self.compiler.compiler_so.remove("-Wstrict-prototypes") - except Exception: - pass - build_ext.build_extensions(self) + def finalize_options(self): + if self.distribution.ext_modules: + # Delay import this to allow for Cython-less installs + from Cython.Build.Dependencies import cythonize + + nthreads = getattr(self, "parallel", None) # -j option in Py3.5+ + nthreads = int(nthreads) if nthreads else None + self.distribution.ext_modules = cythonize( + self.distribution.ext_modules, + nthreads=nthreads, + force=self.force, + gdb_debug=False, + compiler_directives=dict( + profile=False, language_level=3, embedsignature=True + ), + ) + # Skip calling super() and jump straight to setuptools + 
setuptools.command.build_ext.build_ext.finalize_options(self) def run(self): # Get protoc @@ -158,11 +193,9 @@ def run(self): src.write(new_src_content) # Run original Cython build_ext command - build_ext.run(self) + _build_ext.run(self) -cmdclass["build_ext"] = build_ext_and_proto - extensions = [ Extension( "*", @@ -196,6 +229,10 @@ def run(self): ) ] +cmdclass = versioneer.get_cmdclass() +cmdclass["build_ext"] = build_ext_and_proto_no_debug + + setup( name="cudf", version=versioneer.get_version(), @@ -214,13 +251,7 @@ def run(self): ], # Include the separately-compiled shared library setup_requires=["cython", "protobuf"], - ext_modules=cythonize( - extensions, - nthreads=nthreads, - compiler_directives=dict( - profile=False, language_level=3, embedsignature=True - ), - ), + ext_modules=extensions, packages=find_packages(include=["cudf", "cudf.*"]), package_data=dict.fromkeys( find_packages(include=["cudf._lib*"]), ["*.pxd"], From 079af458b55ea83d72293ddf5c2060c0b77d935f Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Thu, 16 Sep 2021 16:47:59 -0400 Subject: [PATCH 02/26] DOC v21.12 Updates --- CHANGELOG.md | 4 ++++ conda/environments/cudf_dev_cuda11.0.yml | 2 +- conda/environments/cudf_dev_cuda11.2.yml | 2 +- cpp/CMakeLists.txt | 2 +- cpp/doxygen/Doxyfile | 4 ++-- cpp/examples/basic/CMakeLists.txt | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- docs/cudf/source/conf.py | 4 ++-- java/src/main/native/CMakeLists.txt | 2 +- 9 files changed, 14 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de00213a6f6..b46ac22d767 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# cuDF 21.12.00 (Date TBD) + +Please see https://github.com/rapidsai/cudf/releases/tag/v21.12.00a for the latest changes to this development branch. + # cuDF 21.10.00 (Date TBD) Please see https://github.com/rapidsai/cudf/releases/tag/v21.10.00a for the latest changes to this development branch. 
diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index bbe1ae70499..9d531c76dc8 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -10,7 +10,7 @@ dependencies: - clang=11.0.0 - clang-tools=11.0.0 - cupy>7.1.0,<10.0.0a0 - - rmm=21.10.* + - rmm=21.12.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index ed4c3ee2efc..9ad6985e291 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -10,7 +10,7 @@ dependencies: - clang=11.0.0 - clang-tools=11.0.0 - cupy>7.1.0,<10.0.0a0 - - rmm=21.10.* + - rmm=21.12.* - cmake>=3.20.1 - cmake_setuptools>=0.1.3 - python>=3.7,<3.9 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 079db9d144b..fe81119c342 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -28,7 +28,7 @@ include(rapids-find) rapids_cuda_init_architectures(CUDF) -project(CUDF VERSION 21.10.00 LANGUAGES C CXX CUDA) +project(CUDF VERSION 21.12.00 LANGUAGES C CXX CUDA) # Needed because GoogleBenchmark changes the state of FindThreads.cmake, # causing subsequent runs to have different values for the `Threads::Threads` target. diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 72524996a69..1141f20e3b1 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 21.10.00 +PROJECT_NUMBER = 21.12.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2167,7 +2167,7 @@ SKIP_FUNCTION_MACROS = YES # the path). 
If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.10 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/21.12 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index aef477c91e1..4175b34ff40 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -6,7 +6,7 @@ set(CPM_DOWNLOAD_VERSION v0.32.2) file(DOWNLOAD https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-21.10) +set(CUDF_TAG branch-21.12) CPMFindPackage(NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 020f5c76c10..9f060c93215 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -25,7 +25,7 @@ include(rapids-cuda) include(rapids-export) include(rapids-find) -project(CUDA_KAFKA VERSION 21.10.00 LANGUAGES CXX) +project(CUDA_KAFKA VERSION 21.12.00 LANGUAGES CXX) # Set a default build type if none was specified rapids_cmake_build_type(Release) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c5f1233d022..4a7d115ae3b 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -80,9 +80,9 @@ # built documents. # # The short X.Y version. -version = '21.10' +version = '21.12' # The full version, including alpha/beta/rc tags. -release = '21.10.00' +release = '21.12.00' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index fc74ee2a3a9..3aa9f14bac4 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -29,7 +29,7 @@ if(DEFINED GPU_ARCHS) endif() rapids_cuda_init_architectures(CUDF_JNI) -project(CUDF_JNI VERSION 21.10.00 LANGUAGES C CXX CUDA) +project(CUDF_JNI VERSION 21.12.00 LANGUAGES C CXX CUDA) ################################################################################################### # - build options --------------------------------------------------------------------------------- From 4defd25ba49601d8c6a8937fd7ce655574ee2858 Mon Sep 17 00:00:00 2001 From: Jordan Jacobelli Date: Mon, 20 Sep 2021 20:21:42 +0200 Subject: [PATCH 03/26] Skip dask-cudf tests on arm64 (#9252) Temporary workaround for `arm64` Importing cudf on arm64 CPU only nodes is currently not working due to a difference in reported gpu devices between arm64 and amd64 Authors: - Jordan Jacobelli (https://github.com/Ethyling) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/9252 --- conda/recipes/dask-cudf/run_test.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh index 3fc1182b33b..f56610bea86 100644 --- a/conda/recipes/dask-cudf/run_test.sh +++ b/conda/recipes/dask-cudf/run_test.sh @@ -8,6 +8,15 @@ function logger() { echo -e "\n>>>> $@\n" } +# Importing cudf on arm64 CPU only nodes is currently not working due to a +# difference in reported gpu devices between arm64 and amd64 +ARCH=$(arch) + +if [ "${ARCH}" = "aarch64" ]; then + logger "Skipping tests on arm64" + exit 0 +fi + # Install the latest version of dask and distributed logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps" pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps 
From 625810a1417bed56a7c19ebde3a83a91e25fec1a Mon Sep 17 00:00:00 2001 From: MithunR Date: Mon, 20 Sep 2021 12:52:17 -0700 Subject: [PATCH 04/26] Explicitly disable groupby on unsupported key types. (#9227) Fixes #8905. Attempting groupby aggregations with `LIST` keys leads to silent failures and bad results. For instance, attempting hash-based `groupby` aggregations with `LIST` keys only fails on DEBUG builds, thus: ``` /home/myth/dev/cudf/2/cpp/include/cudf/table/row_operators.cuh:447: unsigned int cudf: :element_hasher_with_seed::operator()(cudf::column_device_view, signed in t) const [with T = cudf::list_view; void * = (void *)nullptr; hash_function = default_ha sh; __nv_bool has_nulls = false]: block: [0,0,0], thread: [0,0,0] Assertion `false && "Unsupported type in hash."` failed. ``` In RELEASE builds, a copy of the input `LIST` column is returned, causing each output row to be interpreted as its own group. This commit adds an explicit failure for unsupported groupby key types, i.e. those that don't support equality comparisons (like `LIST`). 
Authors: - MithunR (https://github.com/mythrocks) Approvers: - Nghia Truong (https://github.com/ttnghia) - Robert Maynard (https://github.com/robertmaynard) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/9227 --- cpp/include/cudf/utilities/traits.hpp | 25 ++++++++++ cpp/src/groupby/common/utils.hpp | 2 +- cpp/src/groupby/groupby.cu | 5 ++ cpp/src/groupby/sort/sort_helper.cu | 5 ++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/groupby/lists_tests.cpp | 69 +++++++++++++++++++++++++++ cpp/tests/groupby/structs_tests.cpp | 35 ++++++++++---- 7 files changed, 131 insertions(+), 11 deletions(-) create mode 100644 cpp/tests/groupby/lists_tests.cpp diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp index f4e7e3e2a6d..40a833112e1 100644 --- a/cpp/include/cudf/utilities/traits.hpp +++ b/cpp/include/cudf/utilities/traits.hpp @@ -142,6 +142,31 @@ constexpr inline bool is_equality_comparable() return detail::is_equality_comparable_impl::value; } +namespace detail { +/** + * @brief Helper functor to check if a specified type `T` supports equality comparisons. + */ +struct unary_equality_comparable_functor { + template + bool operator()() const + { + return cudf::is_equality_comparable(); + } +}; +} // namespace detail + +/** + * @brief Checks whether `data_type` `type` supports equality comparisons. + * + * @param type Data_type for comparison. + * @return true If `type` supports equality comparisons. + * @return false If `type` does not support equality comparisons. + */ +inline bool is_equality_comparable(data_type type) +{ + return cudf::type_dispatcher(type, detail::unary_equality_comparable_functor{}); +} + /** * @brief Indicates whether the type `T` is a numeric type. 
* diff --git a/cpp/src/groupby/common/utils.hpp b/cpp/src/groupby/common/utils.hpp index 3da20fb9af3..2804dea576e 100644 --- a/cpp/src/groupby/common/utils.hpp +++ b/cpp/src/groupby/common/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-20, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 533f193d692..bdaccba38dc 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -27,10 +27,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include @@ -76,6 +78,9 @@ std::pair, std::vector> groupby::disp // Optionally flatten nested key columns. auto [flattened_keys, _, __, ___] = flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE); + auto is_supported_key_type = [](auto col) { return cudf::is_equality_comparable(col.type()); }; + CUDF_EXPECTS(std::all_of(flattened_keys.begin(), flattened_keys.end(), is_supported_key_type), + "Unsupported groupby key type does not support equality comparison"); auto [grouped_keys, results] = detail::hash::groupby(flattened_keys, requests, _include_null_keys, stream, mr); return std::make_pair(unflatten_nested_columns(std::move(grouped_keys), _keys), diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index 69d68f7b6bc..c4905b86ab9 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -23,8 +23,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -102,6 +104,9 @@ sort_groupby_helper::sort_groupby_helper(table_view const& keys, auto [flattened_keys, _, __, struct_null_vectors] = flatten_nested_columns(keys, {}, {}, column_nullability::FORCE); + auto is_supported_key_type = [](auto col) { return 
cudf::is_equality_comparable(col.type()); }; + CUDF_EXPECTS(std::all_of(flattened_keys.begin(), flattened_keys.end(), is_supported_key_type), + "Unsupported groupby key type does not support equality comparison"); _struct_null_vectors = std::move(struct_null_vectors); _keys = flattened_keys; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d9553d463ab..03f7967cee0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -62,6 +62,7 @@ ConfigureTest(GROUPBY_TEST groupby/count_tests.cpp groupby/groups_tests.cpp groupby/keys_tests.cpp + groupby/lists_tests.cpp groupby/m2_tests.cpp groupby/min_tests.cpp groupby/max_scan_tests.cpp diff --git a/cpp/tests/groupby/lists_tests.cpp b/cpp/tests/groupby/lists_tests.cpp new file mode 100644 index 00000000000..11b8ffa92b9 --- /dev/null +++ b/cpp/tests/groupby/lists_tests.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include + +namespace cudf { +namespace test { + +template +struct groupby_lists_test : public cudf::test::BaseFixture { +}; + +TYPED_TEST_SUITE(groupby_lists_test, cudf::test::FixedWidthTypes); + +namespace { +// Checking with a single aggregation, and aggregation column. +// This test is orthogonal to the aggregation type; it focuses on testing the grouping +// with LISTS keys. 
+auto sum_agg() { return cudf::make_sum_aggregation(); } + +void test_sort_based_sum_agg(column_view const& keys, column_view const& values) +{ + test_single_agg( + keys, values, keys, values, sum_agg(), force_use_sort_impl::YES, null_policy::INCLUDE); +} + +void test_hash_based_sum_agg(column_view const& keys, column_view const& values) +{ + test_single_agg( + keys, values, keys, values, sum_agg(), force_use_sort_impl::NO, null_policy::INCLUDE); +} + +} // namespace + +TYPED_TEST(groupby_lists_test, top_level_lists_are_unsupported) +{ + // Test that grouping on LISTS columns fails visibly. + + // clang-format off + auto keys = lists_column_wrapper { {1,1}, {2,2}, {3,3}, {1,1}, {2,2} }; + auto values = fixed_width_column_wrapper { 0, 1, 2, 3, 4 }; + // clang-format on + + EXPECT_THROW(test_sort_based_sum_agg(keys, values), cudf::logic_error); + EXPECT_THROW(test_hash_based_sum_agg(keys, values), cudf::logic_error); +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/structs_tests.cpp b/cpp/tests/groupby/structs_tests.cpp index 00126a4a5a0..3715ba8d17b 100644 --- a/cpp/tests/groupby/structs_tests.cpp +++ b/cpp/tests/groupby/structs_tests.cpp @@ -22,8 +22,6 @@ #include #include -#include "cudf/aggregation.hpp" -#include "cudf/types.hpp" using namespace cudf::test::iterators; @@ -34,7 +32,7 @@ template struct groupby_structs_test : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(groupby_structs_test, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(groupby_structs_test, cudf::test::FixedWidthTypes); using V = int32_t; // Type of Aggregation Column. using M0 = int32_t; // Type of STRUCT's first (i.e. 0th) member. 
@@ -79,27 +77,43 @@ void print_agg_results(column_view const& keys, column_view const& vals) } } -void test_sum_agg(column_view const& keys, - column_view const& values, - column_view const& expected_keys, - column_view const& expected_values) +void test_sort_based_sum_agg(column_view const& keys, + column_view const& values, + column_view const& expected_keys, + column_view const& expected_values) { test_single_agg(keys, values, expected_keys, expected_values, sum_agg(), - force_use_sort_impl::NO, + force_use_sort_impl::YES, null_policy::INCLUDE); +} + +void test_hash_based_sum_agg(column_view const& keys, + column_view const& values, + column_view const& expected_keys, + column_view const& expected_values) +{ test_single_agg(keys, values, expected_keys, expected_values, sum_agg(), - force_use_sort_impl::YES, + force_use_sort_impl::NO, null_policy::INCLUDE); } +void test_sum_agg(column_view const& keys, + column_view const& values, + column_view const& expected_keys, + column_view const& expected_values) +{ + test_sort_based_sum_agg(keys, values, expected_keys, expected_values); + test_hash_based_sum_agg(keys, values, expected_keys, expected_values); +} + } // namespace TYPED_TEST(groupby_structs_test, basic) @@ -312,7 +326,8 @@ TYPED_TEST(groupby_structs_test, lists_are_unsupported) // clang-format on auto keys = structs{{member_0, member_1}}; - EXPECT_THROW(test_sum_agg(keys, values, keys, values), cudf::logic_error); + EXPECT_THROW(test_sort_based_sum_agg(keys, values, keys, values), cudf::logic_error); + EXPECT_THROW(test_hash_based_sum_agg(keys, values, keys, values), cudf::logic_error); } } // namespace test From 1fdd62f4f593512addf7d98a07650fd2aab02021 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 20 Sep 2021 18:48:17 -0500 Subject: [PATCH 05/26] Fix duplicate names issue in `MultiIndex.deserialize ` (#9258) Fixes: #9254 This PR fixes `deserialize` in `cudf.MultiIndex` so that there is no data-corruption happening when there are duplicate names. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/9258 --- python/cudf/cudf/core/multiindex.py | 6 ++--- python/cudf/cudf/tests/test_multiindex.py | 31 +++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 84566b4627c..fba857694e8 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -977,10 +977,10 @@ def deserialize(cls, header, frames): ) df = cudf.DataFrame.deserialize(header["source_data"], frames) obj = cls.from_frame(df) - obj._set_names(names) - return obj + return obj._set_names(names) columns = column.deserialize_columns(header["columns"], frames) - return cls._from_data(dict(zip(names, columns))) + obj = cls._from_data(dict(zip(range(0, len(names)), columns))) + return obj._set_names(names) def __getitem__(self, index): match = self.take(index) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 465cf36e1f3..981ab8b63b9 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -5,7 +5,9 @@ """ import itertools import operator +import pickle import re +from io import BytesIO import cupy as cp import numpy as np @@ -1553,3 +1555,32 @@ def test_multiIndex_duplicate_names(): ) assert_eq(gi, pi) + + +@pytest.mark.parametrize( + "names", + [ + ["a", "b", "c"], + [None, None, None], + ["aa", "aa", "aa"], + ["bb", "aa", "aa"], + None, + ], +) +def test_pickle_rountrip_multiIndex(names): + df = cudf.DataFrame( + { + "one": [1, 2, 3], + "two": [True, False, True], + "three": ["ab", "cd", "ef"], + "four": [0.2, 0.1, -10.2], + } + ) + expected_df = df.set_index(["one", "two", "three"]) + expected_df.index.names = names + local_file = BytesIO() + + pickle.dump(expected_df, 
local_file) + local_file.seek(0) + actual_df = pickle.load(local_file) + assert_eq(expected_df, actual_df) From ba2cbd91348b18e1685b6149efe73e3f2aeef9ec Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 20 Sep 2021 18:48:51 -0600 Subject: [PATCH 06/26] Add struct type support for `drop_list_duplicates` (#9202) This PR add support for struct type into the existing `drop_list_duplicates` API. This is the first time a nested type is supported in this function. Some more code cleanup has also been done. To be clear: Only structs of basic types and structs of structs are supported. Structs of lists are not, due to their complex nature. Closes #8972. Blocked by https://github.com/rapidsai/cudf/pull/9218 (it is merged). Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - https://github.com/nvdbaranec - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9202 --- .../cudf/lists/drop_list_duplicates.hpp | 28 +- cpp/src/lists/drop_list_duplicates.cu | 548 ++++++++++------ .../lists/drop_list_duplicates_tests.cpp | 597 ++++++++++++++---- cpp/tests/rolling/collect_ops_test.cpp | 45 +- 4 files changed, 875 insertions(+), 343 deletions(-) diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp index f1ce3b7f0e3..e778428510d 100644 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -28,32 +28,32 @@ namespace lists { */ /** - * @brief Create a new lists column by removing duplicated entries from each list element in the - * given lists column + * @brief Create a new lists column by extracting unique entries from list elements in the given + * lists column. * - * @throw cudf::logic_error if any row (list element) in the input column is a nested type. 
- * - * Given an `input` lists_column_view, the list elements in the column are copied to an output lists + * Given an input lists column, the list elements in the column are copied to an output lists * column such that their duplicated entries are dropped out to keep only the unique ones. The * order of those entries within each list are not guaranteed to be preserved as in the input. In * the current implementation, entries in the output lists are sorted by ascending order (nulls * last), but this is not guaranteed in future implementation. * - * @param lists_column The input lists_column_view - * @param nulls_equal Flag to specify whether null entries should be considered equal - * @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only - * applicable for floating point data column) - * @param mr Device resource used to allocate memory + * @throw cudf::logic_error if the child column of the input lists column contains nested type other + * than struct. + * + * @param lists_column The input lists column to extract lists with unique entries. + * @param nulls_equal Flag to specify whether null entries should be considered equal. + * @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only + * applicable for floating point data column). + * @param mr Device resource used to allocate memory. * * @code{.pseudo} - * lists_column = { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } + * input = { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } * output = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } * - * Note that permuting the entries of each list in this output also produces another valid - * output. + * Note that permuting the entries of each list in this output also produces another valid output. * @endcode * - * @return A list column with list elements having unique entries + * @return A lists column with list elements having unique entries. 
*/ std::unique_ptr drop_list_duplicates( lists_column_view const& lists_column, diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 564d919b65d..e53ae4ff0c1 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -22,6 +24,8 @@ #include #include #include +#include +#include #include #include @@ -36,10 +40,15 @@ namespace lists { namespace detail { namespace { template -struct has_negative_nans { +struct has_negative_nans_fn { column_device_view const d_entries; bool const has_nulls; + has_negative_nans_fn(column_device_view const d_entries, bool const has_nulls) + : d_entries(d_entries), has_nulls(has_nulls) + { + } + __device__ Type operator()(size_type idx) const noexcept { if (has_nulls && d_entries.is_null_nocheck(idx)) { return false; } @@ -50,30 +59,53 @@ struct has_negative_nans { }; /** - * @brief A structure to be used along with type_dispatcher to check if a - * `column_view` has any negative NaN entry + * @brief A structure to be used along with type_dispatcher to check if a column has any + * negative NaN value. + * + * This functor is used to check for replacing negative NaN if there exists one. It is neccessary + * because when calling to `lists::detail::sort_lists`, the negative NaN and positive NaN values (if + * both exist) are separated to the two ends of the output column. This is due to the API + * `lists::detail::sort_lists` internally calls `cub::DeviceSegmentedRadixSort`, which performs + * sorting by comparing bits of the input numbers. Since negative and positive NaN have + * different bits representation, they may not be moved to be close to each other after sorted. 
*/ -struct has_negative_nans_fn { +struct has_negative_nans_dispatch { template >* = nullptr> bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const noexcept { auto const d_entries = column_device_view::create(lists_entries, stream); - return thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(lists_entries.size()), - detail::has_negative_nans{*d_entries, lists_entries.has_nulls()}); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.size()), + detail::has_negative_nans_fn{*d_entries, lists_entries.has_nulls()}); } - template >* = nullptr> - bool operator()(column_view const&, rmm::cuda_stream_view) const noexcept + template >* = nullptr> + bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const { - // Columns of non floating-point data will never contain NaN + // Recursively check negative NaN on the children columns. + return std::any_of( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.num_children()), + [structs_view = structs_column_view{lists_entries}, stream](auto const child_idx) { + auto const col = structs_view.get_sliced_child(child_idx); + return type_dispatcher(col.type(), detail::has_negative_nans_dispatch{}, col, stream); + }); + } + + template && + !std::is_same_v>* = nullptr> + bool operator()(column_view const&, rmm::cuda_stream_view) const + { + // Columns of non floating-point data will never contain NaN. return false; } }; template -struct replace_negative_nans { +struct replace_negative_nans_fn { __device__ Type operator()(Type val) const noexcept { return std::isnan(val) ? 
std::numeric_limits::quiet_NaN() : val; @@ -81,58 +113,63 @@ struct replace_negative_nans { }; /** - * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for all entries - * of a floating-point data column + * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for all rows + * in a floating-point data column. */ -struct replace_negative_nans_fn { - template >* = nullptr> - void operator()(column_view const&, mutable_column_view const&, rmm::cuda_stream_view) const +struct replace_negative_nans_dispatch { + template && + !std::is_same_v>* = nullptr> + std::unique_ptr operator()(column_view const& lists_entries, + rmm::cuda_stream_view) const noexcept { - CUDF_FAIL("Cannot operate on a type that is not floating-point."); + // For non floating point type and non struct, just return a copy of the input. + return std::make_unique(lists_entries); } template >* = nullptr> - void operator()(column_view const& lists_entries, - mutable_column_view const& new_entries, - rmm::cuda_stream_view stream) const noexcept + std::unique_ptr operator()(column_view const& lists_entries, + rmm::cuda_stream_view stream) const noexcept { - // Do not care whether an entry is null or not, just consider it as a floating-point value - thrust::transform(rmm::exec_policy(stream), - lists_entries.begin(), - lists_entries.end(), - new_entries.begin(), - detail::replace_negative_nans{}); - } -}; + auto new_entries = cudf::detail::allocate_like( + lists_entries, lists_entries.size(), cudf::mask_allocation_policy::NEVER, stream); + new_entries->set_null_mask(cudf::detail::copy_bitmask(lists_entries, stream), + lists_entries.null_count()); -/** - * @brief Transform a given lists column to a new lists column in which all the list entries holding - * -NaN value are replaced by (positive) NaN - */ -std::unique_ptr replace_negative_nans_entries(column_view const& lists_entries, - lists_column_view const& lists_column, - rmm::cuda_stream_view 
stream) -{ - auto new_offsets = std::make_unique(lists_column.offsets()); - auto new_entries = std::make_unique(lists_entries); + // Replace all negative NaN values. + thrust::transform(rmm::exec_policy(stream), + lists_entries.template begin(), + lists_entries.template end(), + new_entries->mutable_view().template begin(), + detail::replace_negative_nans_fn{}); - type_dispatcher(lists_entries.type(), - detail::replace_negative_nans_fn{}, - lists_entries, - new_entries->mutable_view(), - stream); + return new_entries; + } - return make_lists_column( - lists_column.size(), - std::move(new_offsets), - std::move(new_entries), - lists_column.null_count(), - cudf::detail::copy_bitmask( - lists_column.parent(), stream, rmm::mr::get_current_device_resource())); -} + template >* = nullptr> + std::unique_ptr operator()(column_view const& lists_entries, + rmm::cuda_stream_view stream) const noexcept + { + std::vector> output_struct_members; + std::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(lists_entries.num_children()), + std::back_inserter(output_struct_members), + [structs_view = structs_column_view{lists_entries}, stream](auto const child_idx) { + auto const col = structs_view.get_sliced_child(child_idx); + return type_dispatcher(col.type(), detail::replace_negative_nans_dispatch{}, col, stream); + }); + + return cudf::make_structs_column(lists_entries.size(), + std::move(output_struct_members), + lists_entries.null_count(), + cudf::detail::copy_bitmask(lists_entries, stream), + stream); + } +}; /** - * @brief Generate a 0-based offset column for a lists column + * @brief Generate a 0-based offset column for a lists column. * * Given a lists_column_view, which may have a non-zero offset, generate a new column containing * 0-based list offsets. 
This is done by subtracting each of the input list offset by the first @@ -143,11 +180,10 @@ std::unique_ptr replace_negative_nans_entries(column_view const& lists_e * then output_offsets = { 0, 4, 6, 10 } * @endcode * - * @param lists_column The input lists column - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * - * @return A column containing 0-based list offsets + * @param lists_column The input lists column. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device resource used to allocate memory. + * @return A column containing 0-based list offsets. */ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_column, rmm::cuda_stream_view stream, @@ -168,7 +204,35 @@ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_co } /** - * @brief Populate list offsets for all list entries + * @brief Transform a given lists column to a new lists column in which all the list entries holding + * -NaN value are replaced by (positive) NaN. + * + * Replacing -NaN by NaN is necessary before sorting (individual) lists because the sorting API is + * using radix sort, which compares bits of the number thus it may separate -NaN by NaN to the two + * ends of the result column. + */ +std::unique_ptr replace_negative_nans_entries(column_view const& lists_entries, + lists_column_view const& lists_column, + rmm::cuda_stream_view stream) +{ + // We need to copy the offsets column of the input lists_column. Since the input lists_column may + // be sliced, we need to generate clean offsets (i.e., offsets starting from zero). 
+ auto new_offsets = + generate_clean_offsets(lists_column, stream, rmm::mr::get_current_device_resource()); + auto new_entries = type_dispatcher( + lists_entries.type(), detail::replace_negative_nans_dispatch{}, lists_entries, stream); + + return make_lists_column( + lists_column.size(), + std::move(new_offsets), + std::move(new_entries), + lists_column.null_count(), + cudf::detail::copy_bitmask( + lists_column.parent(), stream, rmm::mr::get_current_device_resource())); +} + +/** + * @brief Populate list offsets for all list entries. * * Given an `offsets` column_view containing offsets of a lists column and a number of all list * entries in the column, generate an array that maps from each list entry to the offset of the list @@ -179,12 +243,11 @@ std::unique_ptr generate_clean_offsets(lists_column_view const& lists_co * output = { 1, 1, 1, 1, 2, 2, 3, 3, 3, 3 } * @endcode * - * @param num_entries The number of list entries - * @param offsets Column view to the list offsets - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * - * @return A column containing entry list offsets + * @param num_entries The number of list entries. + * @param offsets Column view to the list offsets. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device resource used to allocate memory. + * @return A column containing entry list offsets. */ std::unique_ptr generate_entry_list_offsets(size_type num_entries, column_view const& offsets, @@ -205,95 +268,172 @@ std::unique_ptr generate_entry_list_offsets(size_type num_entries, } /** - * @brief Performs an equality comparison between two entries in a lists column + * @brief Performs an equality comparison between two entries in a lists column. * - * For the two elements that are in the same list in the lists column, they will always be - * considered as different. 
If they are from the same list and their type is one of floating - * point types, this functor will return the same comparison result as - * `cudf::element_equality_comparator`. + * For the two elements that are NOT in the same list in the lists column, they will always be + * considered as different. If they are from the same list and their type is not floating point, + * this functor will return the same comparison result as `cudf::element_equality_comparator`. * * For floating-point types, entries holding NaN value can be considered as different values or the - * same value depending on the nans_equal parameter. + * same value depending on the `nans_equal` parameter. * - * @tparam Type The data type of entries + * @tparam Type The data type of entries * @tparam nans_equal Flag to specify whether NaN entries should be considered as equal value (only * applicable for floating-point data column) */ -template -class list_entry_comparator { - public: - list_entry_comparator(offset_type const* list_offsets, - column_device_view d_view, - null_equality nulls_equal, - bool has_nulls) - : list_offsets(list_offsets), d_view{d_view}, nulls_equal{nulls_equal}, has_nulls(has_nulls) +template +struct column_row_comparator_fn { + offset_type const* const list_offsets; + column_device_view const lhs; + column_device_view const rhs; + null_equality const nulls_equal; + bool const has_nulls; + bool const nans_equal; + + __host__ __device__ column_row_comparator_fn(offset_type const* const list_offsets, + column_device_view const& lhs, + column_device_view const& rhs, + null_equality const nulls_equal, + bool const has_nulls, + bool const nans_equal) + : list_offsets(list_offsets), + lhs(lhs), + rhs(rhs), + nulls_equal(nulls_equal), + has_nulls(has_nulls), + nans_equal(nans_equal) { } - template - std::enable_if_t and nans_equal_, bool> __device__ - operator()(size_type i, size_type j) const noexcept + template >* = nullptr> + bool __device__ compare(T const& lhs_val, T const& 
rhs_val) const noexcept { - // Two entries are not considered for equality if they belong to different lists - if (list_offsets[i] != list_offsets[j]) { return false; } + return lhs_val == rhs_val; + } - if (has_nulls) { - bool const nullable = d_view.nullable(); - bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)}; - bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)}; - if (lhs_is_null and rhs_is_null) { - return nulls_equal == null_equality::EQUAL; - } else if (lhs_is_null != rhs_is_null) { - return false; - } - } + template >* = nullptr> + bool __device__ compare(T const& lhs_val, T const& rhs_val) const noexcept + { + // If both element(i) and element(j) are NaNs and nans are considered as equal value then this + // comparison will return `true`. This is the desired behavior in Pandas. + if (nans_equal && std::isnan(lhs_val) && std::isnan(rhs_val)) { return true; } - // For floating-point types, if both element(i) and element(j) are NaNs then this comparison - // will return `true`. This is the desired behavior in Pandas. - auto const lhs = d_view.element(i); - auto const rhs = d_view.element(j); - if (std::isnan(lhs) and std::isnan(rhs)) { return true; } - return lhs == rhs; + // If nans are considered as NOT equal, even both element(i) and element(j) are NaNs this + // comparison will still return `false`. This is the desired behavior in Apache Spark. + return lhs_val == rhs_val; } - template - std::enable_if_t or not nans_equal_, bool> __device__ - operator()(size_type i, size_type j) const noexcept + bool __device__ operator()(size_type i, size_type j) const noexcept { - // Two entries are not considered for equality if they belong to different lists + // Two entries are not considered for equality if they belong to different lists. 
if (list_offsets[i] != list_offsets[j]) { return false; } if (has_nulls) { - bool const nullable = d_view.nullable(); - bool const lhs_is_null{nullable and d_view.is_null_nocheck(i)}; - bool const rhs_is_null{nullable and d_view.is_null_nocheck(j)}; - if (lhs_is_null and rhs_is_null) { + bool const lhs_is_null{lhs.nullable() && lhs.is_null_nocheck(i)}; + bool const rhs_is_null{rhs.nullable() && rhs.is_null_nocheck(j)}; + if (lhs_is_null && rhs_is_null) { return nulls_equal == null_equality::EQUAL; } else if (lhs_is_null != rhs_is_null) { return false; } } - // For floating-point types, if both element(i) and element(j) are NaNs then this comparison - // will return `false`. This is the desired behavior in Apache Spark. - return d_view.element(i) == d_view.element(j); + return compare(lhs.element(i), lhs.element(j)); + } +}; + +/** + * @brief Struct used in type_dispatcher for comparing two entries in a lists column. + */ +struct column_row_comparator_dispatch { + offset_type const* const list_offsets; + column_device_view const lhs; + column_device_view const rhs; + null_equality const nulls_equal; + bool const has_nulls; + bool const nans_equal; + + __device__ column_row_comparator_dispatch(offset_type const* const list_offsets, + column_device_view const& lhs, + column_device_view const& rhs, + null_equality const nulls_equal, + bool const has_nulls, + bool const nans_equal) + : list_offsets(list_offsets), + lhs(lhs), + rhs(rhs), + nulls_equal(nulls_equal), + has_nulls(has_nulls), + nans_equal(nans_equal) + { + } + + template ()>* = nullptr> + bool __device__ operator()(size_type i, size_type j) const noexcept + { + return column_row_comparator_fn{ + list_offsets, lhs, rhs, nulls_equal, has_nulls, nans_equal}(i, j); + } + + template ()>* = nullptr> + bool operator()(size_type i, size_type j) const + { + CUDF_FAIL( + "`column_row_comparator_dispatch` cannot operate on types that are not equally comparable."); } +}; - private: - offset_type const* list_offsets; - 
column_device_view d_view; - null_equality nulls_equal; - bool has_nulls; +/** + * @brief Performs an equality comparison between rows of two tables using `column_row_comparator` + * to compare rows of their corresponding columns. + */ +struct table_row_comparator_fn { + offset_type const* const list_offsets; + table_device_view const lhs; + table_device_view const rhs; + null_equality const nulls_equal; + bool const has_nulls; + bool const nans_equal; + + table_row_comparator_fn(offset_type const* const list_offsets, + table_device_view const& lhs, + table_device_view const& rhs, + null_equality const nulls_equal, + bool const has_nulls, + bool const nans_equal) + : list_offsets(list_offsets), + lhs(lhs), + rhs(rhs), + nulls_equal(nulls_equal), + has_nulls(has_nulls), + nans_equal(nans_equal) + { + } + + bool __device__ operator()(size_type i, size_type j) const noexcept + { + auto column_comp = [=](column_device_view const& lhs, column_device_view const& rhs) { + return type_dispatcher( + lhs.type(), + column_row_comparator_dispatch{list_offsets, lhs, rhs, nulls_equal, has_nulls, nans_equal}, + i, + j); + }; + + return thrust::equal(thrust::seq, lhs.begin(), lhs.end(), rhs.begin(), column_comp); + } }; /** - * @brief Construct type-dispatched function object for copying indices of the list entries - * ignoring duplicates + * @brief Struct used in type_dispatcher for copying indices of the list entries ignoring + * duplicates. 
*/ -struct get_unique_entries_fn { - template ()>* = nullptr> +struct get_unique_entries_dispatch { + template () && + !std::is_same_v>* = nullptr> offset_type* operator()(offset_type const*, - column_device_view&, + column_view const&, size_type, offset_type*, null_equality, @@ -301,12 +441,13 @@ struct get_unique_entries_fn { bool, rmm::cuda_stream_view) const { - CUDF_FAIL("Cannot operate on types that are not equally comparable."); + CUDF_FAIL( + "`get_unique_entries_dispatch` cannot operate on types that are not equally comparable."); } template ()>* = nullptr> offset_type* operator()(offset_type const* list_offsets, - column_device_view& d_view, + column_view const& all_lists_entries, size_type num_entries, offset_type* output_begin, null_equality nulls_equal, @@ -314,41 +455,69 @@ struct get_unique_entries_fn { bool has_nulls, rmm::cuda_stream_view stream) const noexcept { - if (nans_equal == nan_equality::ALL_EQUAL) { - list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); - } else { - list_entry_comparator const comp{list_offsets, d_view, nulls_equal, has_nulls}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); - } + auto const d_view = column_device_view::create(all_lists_entries, stream); + auto const comp = column_row_comparator_fn{list_offsets, + *d_view, + *d_view, + nulls_equal, + has_nulls, + nans_equal == nan_equality::ALL_EQUAL}; + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); + } + + template >* = nullptr> + offset_type* operator()(offset_type const* list_offsets, + column_view const& all_lists_entries, + size_type num_entries, + 
offset_type* output_begin, + null_equality nulls_equal, + nan_equality nans_equal, + bool has_nulls, + rmm::cuda_stream_view stream) const noexcept + { + auto const entries_tview = table_view{{all_lists_entries}}; + auto const flatten_nullability = has_nested_nulls(entries_tview) + ? structs::detail::column_nullability::FORCE + : structs::detail::column_nullability::MATCH_INCOMING; + auto const entries_flattened = cudf::structs::detail::flatten_nested_columns( + entries_tview, {order::ASCENDING}, {null_order::AFTER}, flatten_nullability); + auto const d_view = table_device_view::create(std::get<0>(entries_flattened), stream); + + auto const comp = table_row_comparator_fn{list_offsets, + *d_view, + *d_view, + nulls_equal, + has_nulls, + nans_equal == nan_equality::ALL_EQUAL}; + + return thrust::unique_copy(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp); } }; /** - * @brief Copy list entries and entry list offsets ignoring duplicates + * @brief Copy list entries and entry list offsets ignoring duplicates. * * Given an array of all entries flattened from a list column and an array that maps each entry to * the offset of the list containing that entry, those entries and list offsets are copied into * new arrays such that the duplicated entries within each list will be ignored. * - * @param all_lists_entries The input array containing all list entries - * @param entries_list_offsets A map from list entries to their corresponding list offsets - * @param nulls_equal Flag to specify whether null entries should be considered equal - * @param nans_equal Flag to specify whether NaN entries should be considered as equal - * value (only applicable for floating-point data column) - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device resource used to allocate memory - * + * @param all_lists_entries The input array containing all list entries. 
+ * @param entries_list_offsets A map from list entries to their corresponding list offsets. + * @param nulls_equal Flag to specify whether null entries should be considered equal. + * @param nans_equal Flag to specify whether NaN entries should be considered equal + * (only applicable for floating-point data column). + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device resource used to allocate memory. * @return A pair of columns, the first one contains unique list entries and the second one - * contains their corresponding list offsets + * contains their corresponding list offsets. */ std::vector> get_unique_entries_and_list_offsets( column_view const& all_lists_entries, @@ -358,16 +527,15 @@ std::vector> get_unique_entries_and_list_offsets( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const num_entries = all_lists_entries.size(); - auto const d_view_entries = column_device_view::create(all_lists_entries, stream); + auto const num_entries = all_lists_entries.size(); - // Allocate memory to store the indices of the unique entries + // Allocate memory to store the indices of the unique entries. auto unique_indices = rmm::device_uvector(num_entries, stream); auto const output_begin = unique_indices.begin(); auto const output_end = type_dispatcher(all_lists_entries.type(), - get_unique_entries_fn{}, + get_unique_entries_dispatch{}, entries_list_offsets.begin(), - *d_view_entries, + all_lists_entries, num_entries, output_begin, nulls_equal, @@ -375,9 +543,9 @@ std::vector> get_unique_entries_and_list_offsets( all_lists_entries.has_nulls(), stream); - // Collect unique entries and entry list offsets + // Collect unique entries and entry list offsets. // The new null_count and bitmask of the unique entries will also be generated - // by the gather function + // by the gather function. 
return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, output_begin, output_end, @@ -388,27 +556,27 @@ std::vector> get_unique_entries_and_list_offsets( } /** - * @brief Generate list offsets from entry offsets + * @brief Generate list offsets from entry offsets. * - * Generate an array of list offsets for the final result lists column. The list - * offsets of the original lists column are also taken into account to make sure the result lists - * column will have the same empty list rows (if any) as in the original lists column. + * Generate an array of list offsets for the final result lists column. The list offsets of the + * original lists column are also taken into account to make sure the result lists column will have + * the same empty list rows (if any) as in the original lists column. * - * @param[in] num_entries The number of unique entries after removing duplicates - * @param[in] entries_list_offsets The mapping from list entries to their list offsets - * @param[out] original_offsets The list offsets of the original lists column, which - * will also be used to store the new list offsets - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * @param[in] mr Device resource used to allocate memory + * @param num_entries The number of unique entries after removing duplicates. + * @param entries_list_offsets The mapping from list entries to their list offsets. + * @param original_offsets The list offsets of the original lists column, which will also be used to + * store the new list offsets. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device resource used to allocate memory. 
*/ void generate_offsets(size_type num_entries, column_view const& entries_list_offsets, mutable_column_view const& original_offsets, rmm::cuda_stream_view stream) { - // Firstly, generate temporary list offsets for the unique entries, ignoring empty lists (if any) + // Firstly, generate temporary list offsets for the unique entries, ignoring empty lists (if any). // If entries_list_offsets = {1, 1, 1, 1, 2, 3, 3, 3, 4, 4 }, num_entries = 10, - // then new_offsets = { 0, 4, 5, 8, 10 } + // then new_offsets = { 0, 4, 5, 8, 10 }. auto const new_offsets = allocate_like( original_offsets, mask_allocation_policy::NEVER, rmm::mr::get_current_device_resource()); thrust::copy_if(rmm::exec_policy(stream), @@ -421,10 +589,9 @@ void generate_offsets(size_type num_entries, }); // Generate a prefix sum of number of empty lists, storing inplace to the original lists - // offsets + // offsets. // If the original list offsets is { 0, 0, 5, 5, 6, 6 } (there are 2 empty lists), - // and new_offsets = { 0, 4, 6 }, - // then output = { 0, 1, 1, 2, 2, 3} + // and new_offsets = { 0, 4, 6 }, then output = { 0, 1, 1, 2, 2, 3}. auto const iter_trans_begin = cudf::detail::make_counting_transform_iterator( 0, [offsets = original_offsets.begin()] __device__(auto i) { return (i > 0 && offsets[i] == offsets[i - 1]) ? 1 : 0; @@ -434,10 +601,10 @@ void generate_offsets(size_type num_entries, iter_trans_begin + original_offsets.size(), original_offsets.begin()); - // Generate the final list offsets + // Generate the final list offsets. // If the original list offsets are { 0, 0, 5, 5, 6, 6 }, the new offsets are { 0, 4, 6 }, - // and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 }, - // then output = { 0, 0, 4, 4, 5, 5 } + // and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 }, + // then output = { 0, 0, 4, 4, 5, 5 }. 
thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(original_offsets.size()), @@ -453,7 +620,7 @@ void generate_offsets(size_type num_entries, /** * @copydoc cudf::lists::drop_list_duplicates * - * @param stream CUDA stream used for device memory operations and kernel launches + * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, null_equality nulls_equal, @@ -462,22 +629,23 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu rmm::mr::device_memory_resource* mr) { if (lists_column.is_empty()) return cudf::empty_like(lists_column.parent()); - if (cudf::is_nested(lists_column.child().type())) { - CUDF_FAIL("Nested types are not supported in drop_list_duplicates."); + if (auto const child_type = lists_column.child().type(); + cudf::is_nested(child_type) && child_type.id() != type_id::STRUCT) { + CUDF_FAIL("Nested types other than STRUCT are not supported in `drop_list_duplicates`."); } - // Flatten all entries (depth = 1) of the lists column + // Flatten all entries (depth = 1) of the lists column. auto const lists_entries = lists_column.get_sliced_child(stream); - // sorted_lists will store the results of the original lists after calling segmented_sort + // sorted_lists will store the results of the original lists after calling segmented_sort. auto const sorted_lists = [&]() { // If nans_equal == ALL_EQUAL and the column contains lists of floating-point data type, - // we need to replace -NaN by NaN before sorting + // we need to replace -NaN by NaN before sorting. 
auto const replace_negative_nan = - nans_equal == nan_equality::ALL_EQUAL and - type_dispatcher(lists_entries.type(), detail::has_negative_nans_fn{}, lists_entries, stream); + nans_equal == nan_equality::ALL_EQUAL && + type_dispatcher( + lists_entries.type(), detail::has_negative_nans_dispatch{}, lists_entries, stream); if (replace_negative_nan) { - // The column new_lists_column is temporary, thus we will not pass in `mr` auto const new_lists_column = detail::replace_negative_nans_entries(lists_entries, lists_column, stream); return detail::sort_lists( @@ -490,28 +658,28 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu auto const sorted_lists_entries = lists_column_view(sorted_lists->view()).get_sliced_child(stream); - // Generate a 0-based offset column + // Generate a 0-based offset column. auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr); - // Generate a mapping from list entries to offsets of the lists containing those entries + // Generate a mapping from list entries to offsets of the lists containing those entries. auto const entries_list_offsets = detail::generate_entry_list_offsets(sorted_lists_entries.size(), lists_offsets->view(), stream); - // Copy non-duplicated entries (along with their list offsets) to new arrays + // Copy non-duplicated entries (along with their list offsets) to new arrays. auto unique_entries_and_list_offsets = detail::get_unique_entries_and_list_offsets( sorted_lists_entries, entries_list_offsets->view(), nulls_equal, nans_equal, stream, mr); - // Generate offsets for the new lists column + // Generate offsets for the new lists column. detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), unique_entries_and_list_offsets.back()->view(), lists_offsets->mutable_view(), stream); - // Construct a new lists column without duplicated entries + // Construct a new lists column without duplicated entries. 
// Reuse the null_count and bitmask of the lists_column: those are the null information for - // the list elements (rows) + // the list elements (rows). // For the entries of those lists (rows), their null_count and bitmask were generated separately - // during the step `get_unique_entries_and_list_offsets` above + // during the step `get_unique_entries_and_list_offsets` above. return make_lists_column(lists_column.size(), std::move(lists_offsets), std::move(unique_entries_and_list_offsets.front()), diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index bc413fd220a..270e01075b9 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -14,61 +14,65 @@ * limitations under the License. */ +#include + #include #include +#include #include #include -#include #include #include -using int_type = int32_t; -using float_type = float; - -using LIST_COL_FLT = cudf::test::lists_column_wrapper; -using LIST_COL_STR = cudf::test::lists_column_wrapper; +using namespace cudf::test::iterators; -auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); -auto constexpr neg_Inf = -std::numeric_limits::infinity(); -auto constexpr NaN = std::numeric_limits::quiet_NaN(); -auto constexpr Inf = std::numeric_limits::infinity(); +using float_type = float; +using FloatListsCol = cudf::test::lists_column_wrapper; +using StrListsCol = cudf::test::lists_column_wrapper; +using StringsCol = cudf::test::strings_column_wrapper; +using StructsCol = cudf::test::structs_column_wrapper; +using IntsCol = cudf::test::fixed_width_column_wrapper; +using FloatsCol = cudf::test::fixed_width_column_wrapper; -template -void test_once(cudf::column_view const& input, - LCW const& expected, - cudf::null_equality nulls_equal = cudf::null_equality::EQUAL) -{ - auto const results = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{input}, nulls_equal); - if 
(cudf::is_floating_point(input.type())) { - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); - } else { - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); - } -} +auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); +auto constexpr neg_Inf = -std::numeric_limits::infinity(); +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr Inf = std::numeric_limits::infinity(); +auto constexpr verbosity = cudf::test::debug_output_level::FIRST_ERROR; struct DropListDuplicatesTest : public cudf::test::BaseFixture { }; TEST_F(DropListDuplicatesTest, FloatingPointTestsWithSignedZero) { - // -0.0 and 0.0 should be considered equal - test_once(LIST_COL_FLT{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0}, - LIST_COL_FLT{0, 1, 2}); + // -0.0 and 0.0 should be considered equal. + auto const lists = FloatListsCol{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0}; + auto const expected = FloatListsCol{0, 1, 2}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); } TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) { - // Lists contain inf - test_once(LIST_COL_FLT{0, 1, 2, 0, 1, 2, 0, 1, 2, Inf, Inf, Inf}, LIST_COL_FLT{0, 1, 2, Inf}); - test_once(LIST_COL_FLT{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}, - LIST_COL_FLT{neg_Inf, 0, Inf}); + // Lists contain inf. 
+ { + auto const lists = FloatListsCol{0, 1, 2, 0, 1, 2, 0, 1, 2, Inf, Inf, Inf}; + auto const expected = FloatListsCol{0, 1, 2, Inf}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + { + auto const lists = FloatListsCol{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}; + auto const expected = FloatListsCol{neg_Inf, 0, Inf}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } // The position of NaN is undefined after sorting, thus we need to offload the data to CPU to -// check for validity +// check for validity. // We will not store NaN in the results_expected variable (an unordered_set) because we can't check // for NaN existence in a set. Instead, we will count the number of NaNs in the input and compare // with the number of NaNs in the output. @@ -77,14 +81,14 @@ static void test_floating_point(std::vector const& h_input, cudf::nan_equality nans_equal) { // If NaNs are considered as equal value, the final result should always contain at max ONE NaN - // entry per list + // entry per list. std::size_t const num_NaNs = nans_equal == cudf::nan_equality::ALL_EQUAL ? 
std::size_t{1} : std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); auto const results_col = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{LIST_COL_FLT(h_input.begin(), h_input.end())}, + cudf::lists_column_view{FloatListsCol(h_input.begin(), h_input.end())}, cudf::null_equality::EQUAL, nans_equal); auto const results_arr = @@ -125,130 +129,479 @@ TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) TEST_F(DropListDuplicatesTest, StringTestsNonNull) { - // Trivial cases - test_once(LIST_COL_STR{{}}, LIST_COL_STR{{}}); - test_once(LIST_COL_STR{"this", "is", "a", "string"}, LIST_COL_STR{"a", "is", "string", "this"}); - - // One list column - test_once(LIST_COL_STR{"this", "is", "is", "is", "a", "string", "string"}, - LIST_COL_STR{"a", "is", "string", "this"}); - - // Multiple lists column - test_once( - LIST_COL_STR{LIST_COL_STR{"this", "is", "a", "no duplicate", "string"}, - LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}, - LIST_COL_STR{"this", "is", "is", "is", "a", "two duplicates", "string"}, - LIST_COL_STR{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}, - LIST_COL_STR{LIST_COL_STR{"a", "is", "no duplicate", "string", "this"}, - LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}, - LIST_COL_STR{"a", "is", "string", "this", "two duplicates"}, - LIST_COL_STR{"a", "is", "string", "this", "three duplicates"}}); + // Trivial cases - empty input. + { + auto const lists = StrListsCol{{}}; + auto const expected = StrListsCol{{}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // No duplicate entry. 
+ { + auto const lists = StrListsCol{"this", "is", "a", "string"}; + auto const expected = StrListsCol{"a", "is", "string", "this"}; + auto const results = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // One list column. + { + auto const lists = StrListsCol{"this", "is", "is", "is", "a", "string", "string"}; + auto const expected = StrListsCol{"a", "is", "string", "this"}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // One list column, input is a strings column with given non-default null_equality and + // nans_equality parameters. + { + auto const lists = StrListsCol{"this", "is", "is", "is", "a", "string", "string"}; + auto const expected = StrListsCol{"a", "is", "string", "this"}; + auto const results = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists}, cudf::null_equality::UNEQUAL, cudf::nan_equality::ALL_EQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // Multiple lists column. 
+ { + auto const lists = + StrListsCol{StrListsCol{"this", "is", "a", "no duplicate", "string"}, + StrListsCol{"this", "is", "is", "a", "one duplicate", "string"}, + StrListsCol{"this", "is", "is", "is", "a", "two duplicates", "string"}, + StrListsCol{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}; + auto const expected = StrListsCol{StrListsCol{"a", "is", "no duplicate", "string", "this"}, + StrListsCol{"a", "is", "one duplicate", "string", "this"}, + StrListsCol{"a", "is", "string", "this", "two duplicates"}, + StrListsCol{"a", "is", "string", "this", "three duplicates"}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } TEST_F(DropListDuplicatesTest, StringTestsWithNulls) { auto const null = std::string(""); - // One list column with null entries - test_once( - LIST_COL_STR{{"this", null, "is", "is", "is", "a", null, "string", null, "string"}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 1 && i != 6 && i != 8; })}, - LIST_COL_STR{{"a", "is", "string", "this", null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; })}); + // One list column with null entries. 
+ { + auto const lists = StrListsCol{ + {"this", null, "is", "is", "is", "a", null, "string", null, "string"}, nulls_at({1, 6, 8})}; + auto const expected = StrListsCol{{"a", "is", "string", "this", null}, null_at(4)}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } // Multiple lists column with null lists and null entries - test_once( - LIST_COL_STR{ - {LIST_COL_STR{ - {"this", null, "is", null, "a", null, "no duplicate", null, "string"}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; })}, - LIST_COL_STR{}, - LIST_COL_STR{"this", "is", "is", "a", "one duplicate", "string"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}, - LIST_COL_STR{{LIST_COL_STR{{"a", "is", "no duplicate", "string", "this", null}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i <= 4; })}, - LIST_COL_STR{}, - LIST_COL_STR{"a", "is", "one duplicate", "string", "this"}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; })}); + { + auto const lists = StrListsCol{ + {StrListsCol{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, + nulls_at({1, 3, 5, 7})}, + StrListsCol{}, /* NULL */ + StrListsCol{"this", "is", "is", "a", "one duplicate", "string"}}, + null_at(1)}; + auto const expected = + StrListsCol{{StrListsCol{{"a", "is", "no duplicate", "string", "this", null}, null_at(5)}, + StrListsCol{}, /* NULL */ + StrListsCol{"a", "is", "one duplicate", "string", "this"}}, + null_at(1)}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } template struct DropListDuplicatesTypedTest : public cudf::test::BaseFixture { }; -#define LIST_COL cudf::test::lists_column_wrapper using TypesForTest = cudf::test::Concat; 
-TYPED_TEST_CASE(DropListDuplicatesTypedTest, TypesForTest); +TYPED_TEST_SUITE(DropListDuplicatesTypedTest, TypesForTest); TYPED_TEST(DropListDuplicatesTypedTest, InvalidInputTests) { - // Lists of nested types are not supported + using ListsCol = cudf::test::lists_column_wrapper; + + // Nested types (except struct) are not supported. EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{LIST_COL{LIST_COL{{1, 2}, {3}}}}), + cudf::lists::drop_list_duplicates(cudf::lists_column_view{ListsCol{ListsCol{{1, 2}, {3}}}}), cudf::logic_error); } TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) { - // Empty input - test_once(LIST_COL{{}}, LIST_COL{{}}); + using ListsCol = cudf::test::lists_column_wrapper; + + // Empty input. + { + auto const lists = ListsCol{{}}; + auto const expected = ListsCol{{}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } - // Trivial cases - test_once(LIST_COL{0, 1, 2, 3, 4, 5}, LIST_COL{0, 1, 2, 3, 4, 5}); + // Trivial cases. + { + auto const lists = ListsCol{0, 1, 2, 3, 4, 5}; + auto const expected = ListsCol{0, 1, 2, 3, 4, 5}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } - // Multiple empty lists - test_once(LIST_COL{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}, - LIST_COL{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}); + // Multiple empty lists. 
+ { + auto const lists = ListsCol{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}; + auto const expected = ListsCol{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } TYPED_TEST(DropListDuplicatesTypedTest, NonNullInputTests) { - // Adjacent lists containing the same entries - test_once(LIST_COL{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}, - LIST_COL{{1}, {1, 2}, {2, 3}}); - - // Sliced list column - auto const list0 = - LIST_COL{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; - auto const list1 = cudf::slice(list0, {0, 5})[0]; - auto const list2 = cudf::slice(list0, {1, 5})[0]; - auto const list3 = cudf::slice(list0, {1, 3})[0]; - auto const list4 = cudf::slice(list0, {0, 3})[0]; - - test_once(list0, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list1, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list2, LIST_COL{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}); - test_once(list3, LIST_COL{{1, 2, 3, 4}, {5}}); - test_once(list4, LIST_COL{{1, 2, 3}, {1, 2, 3, 4}, {5}}); + using ListsCol = cudf::test::lists_column_wrapper; + + // Adjacent lists containing the same entries. + { + auto const lists = + ListsCol{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}; + auto const expected = ListsCol{{1}, {1, 2}, {2, 3}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // Sliced list column. 
+ auto const lists_original = + ListsCol{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + auto const lists1 = cudf::slice(lists_original, {0, 5})[0]; + auto const lists2 = cudf::slice(lists_original, {1, 5})[0]; + auto const lists3 = cudf::slice(lists_original, {1, 3})[0]; + auto const lists4 = cudf::slice(lists_original, {0, 3})[0]; + + { + auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists_original}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + { + auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + { + auto const expected = ListsCol{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists2}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + { + auto const expected = ListsCol{{1, 2, 3, 4}, {5}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + { + auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists4}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } } TYPED_TEST(DropListDuplicatesTypedTest, WithNullInputTests) { + using ListsCol = cudf::test::lists_column_wrapper; auto constexpr null = TypeParam{0}; - // null lists - test_once(LIST_COL{{{3, 2, 1, 4, 1}, {5}, {}, {}, {10, 8, 9}, {6, 7}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}, - LIST_COL{{{1, 2, 3, 4}, {5}, {}, {}, {8, 9, 10}, {6, 7}}, - 
cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i != 2 && i != 3; })}); - - // null entries are equal - test_once( - LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 5; })}); - - // nulls entries are not equal - test_once( - LIST_COL{std::initializer_list{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; })}, - LIST_COL{std::initializer_list{1, 3, 5, 7, 9, null, null, null, null, null}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 5; })}, - cudf::null_equality::UNEQUAL); + // null lists. + { + auto const lists = ListsCol{ + {{3, 2, 1, 4, 1}, {5}, {} /*NULL*/, {} /*NULL*/, {10, 8, 9}, {6, 7}}, nulls_at({2, 3})}; + auto const expected = + ListsCol{{{1, 2, 3, 4}, {5}, {} /*NULL*/, {} /*NULL*/, {8, 9, 10}, {6, 7}}, nulls_at({2, 3})}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // null entries are equal. + { + auto const lists = ListsCol{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; + auto const expected = + ListsCol{std::initializer_list{1, 3, 5, 7, 9, null}, null_at(5)}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } + + // nulls entries are not equal. 
+ { + auto const lists = ListsCol{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; + auto const expected = + ListsCol{std::initializer_list{1, 3, 5, 7, 9, null, null, null, null, null}, + nulls_at({5, 6, 7, 8, 9})}; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}, + cudf::null_equality::UNEQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } +} + +TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsNoNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + + auto const get_structs = [] { + auto child1 = ColWrapper{ + 1, 1, 1, 1, 1, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, 2, 2, // list2 + 2, 2, 2, 2, 3, 2, 3, 3 // list3 + }; + auto child2 = StringsCol{ + // begin list1 + "Banana", + "Mango", + "Apple", + "Cherry", + "Kiwi", + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "Cat", + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "XYZ", + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }; + return StructsCol{{child1, child2}}; + }; + + auto const get_structs_expected = [] { + auto child1 = ColWrapper{1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3}; + auto child2 = StringsCol{ + // begin list1 + "Apple", + "Banana", + "Cherry", + "Kiwi", + "Mango", // end list1 + // begin list2 + "Bear", + "Cat", + "Dog", + "Duck", + "Cat", + "Panda", // end list2 + // begin list3 + "ÁBC", + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "XYZ", + "ÁBC" // end list3 + }; + return StructsCol{{child1, child2}}; + }; + + // Test full columns. 
+ { + auto const lists = + cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, IntsCol{0, 5, 11, 17}.release(), get_structs_expected().release(), 0, {}); + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); + } + + // Test sliced columns. + { + auto const lists_original = + cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, IntsCol{0, 5, 11, 17}.release(), get_structs_expected().release(), 0, {}); + auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; + auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } +} + +TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsHaveNull) +{ + using ColWrapper = cudf::test::fixed_width_column_wrapper; + auto constexpr XXX = int32_t{0}; // nulls at the parent structs column level + auto constexpr null = int32_t{0}; // nulls at the children columns level + + auto const get_structs = [] { + auto child1 = ColWrapper{{ + 1, 1, null, XXX, XXX, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, null, 2, // list2 + null, null, 2, 2, 3, 2, 3, 3 // list3 + }, + nulls_at({2, 14, 16, 17})}; + auto child2 = StringsCol{{ + // begin list1 + "Banana", + "Mango", + "Apple", + "XXX", /*NULL*/ + "XXX", /*NULL*/ + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "" /*NULL*/, + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }, + nulls_at({14, 20})}; + return StructsCol{{child1, 
child2}, nulls_at({3, 4})}; + }; + + auto const get_structs_expected = [] { + auto child1 = + ColWrapper{{1, 1, 1, 1, null, XXX, 1, 1, 1, 1, 2, null, 2, 2, 2, 3, 3, 3, null, null}, + nulls_at({4, 5, 11, 18, 19})}; + auto child2 = StringsCol{{ + // begin list1 + "Banana", + "Cherry", + "Kiwi", + "Mango", + "Apple", + "XXX" /*NULL*/, // end list1 + // begin list2 + "Bear", + "Cat", + "Dog", + "Duck", + "Panda", + "" /*NULL*/, // end list2 + // begin list3 + "ÁBC", + "ÁÁÁ", + "ÍÍÍÍÍ", + "XYZ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÉÉÉÉÉ" // end list3 + }, + nulls_at({5, 11, 17})}; + return StructsCol{{child1, child2}, null_at(5)}; + }; + + // Test full columns. + { + auto const lists = + cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, IntsCol{0, 6, 12, 20}.release(), get_structs_expected().release(), 0, {}); + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); + } + + // Test sliced columns. + { + auto const lists_original = + cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, IntsCol{0, 6, 12, 20}.release(), get_structs_expected().release(), 0, {}); + auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; + auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + } +} + +TEST_F(DropListDuplicatesTest, SlicedInputListsOfStructsWithNaNs) +{ + auto const h_child = std::vector{ + 0, -1, 1, 0, 2, 0, 1, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; + + auto const get_structs = [&] { + // Two children are just identical. 
+ auto child1 = FloatsCol(h_child.begin(), h_child.end()); + auto child2 = FloatsCol(h_child.begin(), h_child.end()); + return StructsCol{{child1, child2}}; + }; + + // The first list does not have any NaN or -NaN, while the second list has both. + // `drop_list_duplicates` is expected to operate properly on this second list. + auto const lists_original = + cudf::make_lists_column(2, IntsCol{0, 10, 18}.release(), get_structs().release(), 0, {}); + auto const lists2 = cudf::slice(lists_original->view(), {1, 2})[0]; // test on the second list + + // Contain expected values excluding NaN. + auto const results_children_expected = std::unordered_set{0, 1, 2}; + + // Test for cudf::nan_equality::UNEQUAL. + { + auto const results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists2}); + auto const child = cudf::lists_column_view(results_col->view()).child(); + auto const results_arr = cudf::test::to_host(child.child(0)).first; + + std::size_t const num_NaNs = + std::count_if(h_child.begin(), h_child.end(), [](auto x) { return std::isnan(x); }); + EXPECT_EQ(results_arr.size(), results_children_expected.size() + num_NaNs); + + std::size_t NaN_count{0}; + std::unordered_set results; + for (auto const x : results_arr) { + if (std::isnan(x)) { + ++NaN_count; + } else { + results.insert(x); + } + } + EXPECT_TRUE(results_children_expected.size() == results.size() && NaN_count == num_NaNs); + } + + // Test for cudf::nan_equality::ALL_EQUAL. 
+ { + auto const results_col = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists2}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); + auto const child = cudf::lists_column_view(results_col->view()).child(); + auto const results_arr = cudf::test::to_host(child.child(0)).first; + + std::size_t const num_NaNs = 1; + EXPECT_EQ(results_arr.size(), results_children_expected.size() + num_NaNs); + + std::size_t NaN_count{0}; + std::unordered_set results; + for (auto const x : results_arr) { + if (std::isnan(x)) { + ++NaN_count; + } else { + results.insert(x); + } + } + EXPECT_TRUE(results_children_expected.size() == results.size() && NaN_count == num_NaNs); + } } diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index c26059ee09b..5631c910753 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -2168,34 +2168,45 @@ TEST_F(CollectSetTest, BasicRollingWindowWithNaNs) result_with_nan_equal->view()); } -TEST_F(CollectSetTest, ListTypeRollingWindow) +TEST_F(CollectSetTest, StructTypeRollingWindow) { using namespace cudf; using namespace cudf::test; - auto const input_column = lists_column_wrapper{{1, 2, 3}, {4, 5}, {6}, {7, 8, 9}, {10}}; - - auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; - auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; + auto col1 = fixed_width_column_wrapper{1, 2, 3, 4, 5}; + auto col2 = strings_column_wrapper{"a", "b", "c", "d", "e"}; + auto const input_column = cudf::test::structs_column_wrapper{{col1, col2}}; + auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; + auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; - EXPECT_THROW(rolling_window(input_column, - prev_column, - foll_column, - 1, - *make_collect_set_aggregation()), - cudf::logic_error); + auto const expected = [] { + auto child1 = fixed_width_column_wrapper{1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5}; + auto 
child2 = + strings_column_wrapper{"a", "b", "a", "b", "c", "b", "c", "d", "c", "d", "e", "d", "e"}; + return cudf::make_lists_column( + 5, + fixed_width_column_wrapper{0, 2, 5, 8, 11, 13}.release(), + structs_column_wrapper{{child1, child2}}.release(), + 0, + {}); + }(); + auto const result = rolling_window(input_column, + prev_column, + foll_column, + 1, + *make_collect_set_aggregation()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected->view(), result->view()); } -TEST_F(CollectSetTest, StructTypeRollingWindow) +TEST_F(CollectSetTest, ListTypeRollingWindow) { using namespace cudf; using namespace cudf::test; - auto col1 = fixed_width_column_wrapper{1, 2, 3, 4, 5}; - auto col2 = strings_column_wrapper{"a", "b", "c", "d", "e"}; - auto const input_column = cudf::test::structs_column_wrapper{{col1, col2}}; - auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; - auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; + auto const input_column = lists_column_wrapper{{1, 2, 3}, {4, 5}, {6}, {7, 8, 9}, {10}}; + + auto const prev_column = fixed_width_column_wrapper{1, 2, 2, 2, 2}; + auto const foll_column = fixed_width_column_wrapper{1, 1, 1, 1, 0}; EXPECT_THROW(rolling_window(input_column, prev_column, From 4ac54e1f44d5b7d8810a13848b105443c3070ca7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 21 Sep 2021 18:57:30 -0700 Subject: [PATCH 07/26] Various internal MultiIndex improvements (#9243) This PR is a follow-up to #9191 to further simplify the MultiIndex class. It removes various unused functions, inlines and simplifies a number of other single-use internal functions, and moves more methods to Frame that can be shared. It also makes numerous other miscellaneous improvements to the code in MultiIndex to simplify further rewrites. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Marlene (https://github.com/marlenezw) - H. 
Thomson Comer (https://github.com/thomcom) - GALI PREM SAGAR (https://github.com/galipremsagar) - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9243 --- python/cudf/cudf/core/_base_index.py | 37 +++ python/cudf/cudf/core/column/string.py | 4 +- python/cudf/cudf/core/dataframe.py | 12 - python/cudf/cudf/core/frame.py | 20 +- python/cudf/cudf/core/index.py | 19 -- python/cudf/cudf/core/multiindex.py | 306 ++++++---------------- python/cudf/cudf/tests/test_multiindex.py | 17 ++ 7 files changed, 158 insertions(+), 257 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 1fe59d3dfd6..b2f3274faab 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -44,6 +44,11 @@ def _values(self) -> ColumnBase: def copy(self, deep: bool = True) -> BaseIndex: raise NotImplementedError + @property + def size(self): + # The size of an index is always its length irrespective of dimension. + return len(self) + @property def values(self): return self._values.values @@ -162,6 +167,38 @@ def _clean_nulls_from_index(self): else: return self + @property + def is_monotonic(self): + """Return boolean if values in the object are monotonic_increasing. + + This property is an alias for :attr:`is_monotonic_increasing`. + + Returns + ------- + bool + """ + return self.is_monotonic_increasing + + @property + def is_monotonic_increasing(self): + """Return boolean if values in the object are monotonically increasing. + + Returns + ------- + bool + """ + raise NotImplementedError + + @property + def is_monotonic_decreasing(self): + """Return boolean if values in the object are monotonically decreasing. 
+ + Returns + ------- + bool + """ + raise NotImplementedError + @property def nlevels(self): """ diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c14cbd11714..c59081e4b59 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -352,7 +352,9 @@ def cat(self, others=None, sep=None, na_rep=None): if len(data) == 1 and data.null_count == 1: data = [""] - out = self._return_or_inplace(data) + # We only want to keep the index if we are adding something to each + # row, not if we are joining all the rows into a single string. + out = self._return_or_inplace(data, retain_index=others is not None) if len(out) == 1 and others is None: if isinstance(out, cudf.Series): out = out.iloc[0] diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b8fe4fcaff6..a0811f33351 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -594,12 +594,6 @@ def dtypes(self): data=[x.dtype for x in self._data.columns], index=self._data.names, ) - @property - def shape(self): - """Returns a tuple representing the dimensionality of the DataFrame. - """ - return self._num_rows, self._num_columns - @property def ndim(self): """Dimension of the data. DataFrame ndim is always 2. 
@@ -938,12 +932,6 @@ def memory_usage(self, index=True, deep=False): sizes.append(self.index.memory_usage(deep=deep)) return Series(sizes, index=ind) - def __len__(self): - """ - Returns the number of rows - """ - return len(self.index) - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): import cudf diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 64b96458218..0809e14a8a2 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -166,6 +166,11 @@ def size(self): """ return self._num_columns * self._num_rows + @property + def shape(self): + """Returns a tuple representing the dimensionality of the DataFrame.""" + return self._num_rows, self._num_columns + @property def _is_homogeneous(self): # make sure that the dataframe has columns @@ -4458,6 +4463,12 @@ def to_string(self): def __str__(self): return self.to_string() + def __deepcopy__(self, memo): + return self.copy(deep=True) + + def __copy__(self): + return self.copy(deep=False) + def head(self, n=5): """ Return the first `n` rows. @@ -4726,9 +4737,6 @@ def __iter__(self): """ cudf.utils.utils.raise_iteration_error(obj=self) - def __len__(self): - return len(self._column) - def __bool__(self): raise TypeError( f"The truth value of a {type(self)} is ambiguous. Use " @@ -4916,7 +4924,7 @@ def is_unique(self): @property def is_monotonic(self): - """Return boolean if values in the object are monotonic_increasing. + """Return boolean if values in the object are monotonically increasing. This property is an alias for :attr:`is_monotonic_increasing`. @@ -4928,7 +4936,7 @@ def is_monotonic(self): @property def is_monotonic_increasing(self): - """Return boolean if values in the object are monotonic_increasing. + """Return boolean if values in the object are monotonically increasing. 
Returns ------- @@ -4938,7 +4946,7 @@ def is_monotonic_increasing(self): @property def is_monotonic_decreasing(self): - """Return boolean if values in the object are monotonic_decreasing. + """Return boolean if values in the object are monotonically decreasing. Returns ------- diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 3ac30143463..6414d4a7e84 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -349,17 +349,6 @@ def dtype(self): """ return cudf.dtype(np.int64) - @property - def is_contiguous(self): - """ - Returns if the index is contiguous. - """ - return self._step == 1 - - @property - def size(self): - return len(self) - def find_label_range(self, first=None, last=None): """Find subrange in the ``RangeIndex``, marked by their positions, that starts greater or equal to ``first`` and ends less or equal to ``last`` @@ -417,18 +406,10 @@ def is_unique(self): @property def is_monotonic_increasing(self): - """ - Return if the index is monotonic increasing - (only equal or increasing) values. - """ return self._step > 0 or len(self) <= 1 @property def is_monotonic_decreasing(self): - """ - Return if the index is monotonic decreasing - (only equal or decreasing) values. 
- """ return self._step < 0 or len(self) <= 1 def get_slice_bound(self, label, side, kind=None): diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 84566b4627c..3bf5f70be39 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -7,6 +7,7 @@ import pickle import warnings from collections.abc import Sequence +from numbers import Integral from typing import Any, List, MutableMapping, Optional, Tuple, Union import cupy @@ -17,6 +18,7 @@ import cudf from cudf import _lib as libcudf from cudf._typing import DataFrameOrSeries +from cudf.api.types import is_integer, is_list_like from cudf.core import column from cudf.core._compat import PANDAS_GE_120 from cudf.core.frame import Frame @@ -33,8 +35,6 @@ class MultiIndex(Frame, BaseIndex): ---------- levels : sequence of arrays The unique labels for each level. - labels : sequence of arrays - labels is depreciated, please use levels codes: sequence of arrays Integers for each level designating which label at each location. 
sortorder : optional int @@ -68,7 +68,6 @@ def __init__( levels=None, codes=None, sortorder=None, - labels=None, names=None, dtype=None, copy=False, @@ -78,13 +77,16 @@ def __init__( if sortorder is not None: raise NotImplementedError("sortorder is not yet supported") - if name is not None: raise NotImplementedError( "Use `names`, `name` is not yet supported" ) - - super().__init__() + if len(levels) == 0: + raise ValueError("Must pass non-zero number of levels/codes") + if not isinstance(codes, cudf.DataFrame) and not isinstance( + codes[0], (Sequence, np.ndarray) + ): + raise TypeError("Codes is not a Sequence of sequences") if copy: if isinstance(codes, cudf.DataFrame): @@ -92,58 +94,57 @@ def __init__( if len(levels) > 0 and isinstance(levels[0], cudf.Series): levels = [level.copy(deep=True) for level in levels] - self._name = None - - if labels: - warnings.warn( - "the 'labels' keyword is deprecated, use 'codes' " "instead", - FutureWarning, - ) - if labels and not codes: - codes = labels - - if len(levels) == 0: - raise ValueError("Must pass non-zero number of levels/codes") + if not isinstance(codes, cudf.DataFrame): + if len(levels) == len(codes): + codes = cudf.DataFrame._from_data( + { + i: column.as_column(code).astype(np.int64) + for i, code in enumerate(codes) + } + ) + else: + raise ValueError( + "MultiIndex has unequal number of levels and " + "codes and is inconsistent!" + ) - if not isinstance(codes, cudf.DataFrame) and not isinstance( - codes[0], (Sequence, np.ndarray) - ): - raise TypeError("Codes is not a Sequence of sequences") + levels = [cudf.Series(level) for level in levels] - if isinstance(codes, cudf.DataFrame): - self._codes = codes - elif len(levels) == len(codes): - self._codes = cudf.DataFrame._from_data( - { - i: column.as_column(code).astype(np.int64) - for i, code in enumerate(codes) - } - ) - else: + if len(levels) != len(codes.columns): raise ValueError( "MultiIndex has unequal number of levels and " "codes and is inconsistent!" 
) + if len(set(c.size for c in codes._data.columns)) != 1: + raise ValueError( + "MultiIndex length of codes does not match " + "and is inconsistent!" + ) + for level, code in zip(levels, codes._data.columns): + if code.max() > len(level) - 1: + raise ValueError( + "MultiIndex code %d contains value %d larger " + "than maximum level size at this position" + ) - self._levels = [cudf.Series(level) for level in levels] - self._validate_levels_and_codes(self._levels, self._codes) - - source_data = cudf.DataFrame() - for i, n in enumerate(self._codes.columns): - codes = as_index(self._codes[n]._column) - if -1 in self._codes[n].values: + source_data = {} + for i, (column_name, col) in enumerate(codes._data.items()): + if -1 in col.values: level = cudf.DataFrame( - {n: [None] + list(self._levels[i])}, - index=range(-1, len(self._levels[i])), + {column_name: [None] + list(levels[i])}, + index=range(-1, len(levels[i])), ) else: - level = cudf.DataFrame({n: self._levels[i]}) + level = cudf.DataFrame({column_name: levels[i]}) - source_data[n] = libcudf.copying.gather( - level, codes._data.columns[0] - )[0][n] + source_data[column_name] = libcudf.copying.gather(level, col)[0][ + column_name + ] - self._data = source_data._data + super().__init__(source_data) + self._levels = levels + self._codes = codes + self._name = None self.names = names @property @@ -153,7 +154,6 @@ def names(self): @names.setter def names(self, value): value = [None] * self.nlevels if value is None else value - assert len(value) == self.nlevels if len(value) == len(set(value)): # IMPORTANT: if the provided names are unique, @@ -216,25 +216,20 @@ def rename(self, names, inplace=False): return self.set_names(names, level=None, inplace=inplace) def set_names(self, names, level=None, inplace=False): - if ( - level is not None - and not cudf.api.types.is_list_like(level) - and cudf.api.types.is_list_like(names) - ): + names_is_list_like = is_list_like(names) + level_is_list_like = is_list_like(level) + + if 
level is not None and not level_is_list_like and names_is_list_like: raise TypeError( "Names must be a string when a single level is provided." ) - if ( - not cudf.api.types.is_list_like(names) - and level is None - and self.nlevels > 1 - ): + if not names_is_list_like and level is None and self.nlevels > 1: raise TypeError("Must pass list-like as `names`.") - if not cudf.api.types.is_list_like(names): + if not names_is_list_like: names = [names] - if level is not None and not cudf.api.types.is_list_like(level): + if level is not None and not level_is_list_like: level = [level] if level is not None and len(names) != len(level): @@ -269,10 +264,6 @@ def _from_data( obj.name = name return obj - @property - def shape(self): - return (self._data.nrows, len(self._data.names)) - @property def name(self): return self._name @@ -281,26 +272,6 @@ def name(self): def name(self, value): self._name = value - def _validate_levels_and_codes(self, levels, codes): - if len(levels) != len(codes.columns): - raise ValueError( - "MultiIndex has unequal number of levels and " - "codes and is inconsistent!" - ) - code_length = len(codes[codes.columns[0]]) - for index, code in enumerate(codes): - if code_length != len(codes[code]): - raise ValueError( - "MultiIndex length of codes does not match " - "and is inconsistent!" - ) - for index, code in enumerate(codes): - if codes[code].max() > len(levels[index]) - 1: - raise ValueError( - "MultiIndex code %d contains value %d larger " - "than maximum level size at this position" - ) - def copy( self, names=None, @@ -396,36 +367,9 @@ def copy( return mi - def deepcopy(self): - return self.copy(deep=True) - - def __copy__(self): - return self.copy(deep=True) - def __iter__(self): - """ - Iterating over a GPU object is not effecient and hence not supported. - - Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` - if you wish to iterate over the values. 
- """ cudf.utils.utils.raise_iteration_error(obj=self) - def _popn(self, n): - """ Returns a copy of this index without the left-most n values. - - Removes n names, labels, and codes in order to build a new index - for results. - """ - result = MultiIndex( - levels=self.levels[n:], - codes=self.codes.iloc[:, n:], - names=self.names[n:], - ) - if self.names is not None: - result.names = self.names[n:] - return result - def __repr__(self): max_seq_items = get_option("display.max_seq_items") or len(self) @@ -534,9 +478,7 @@ def codes(self): @property def nlevels(self): - """ - Integer number of levels in this MultiIndex. - """ + """Integer number of levels in this MultiIndex.""" return len(self._data) @property @@ -576,23 +518,13 @@ def levels(self): self._compute_levels_and_codes() return self._levels - @property - def labels(self): - warnings.warn( - "This feature is deprecated in pandas and will be" - "dropped from cudf as well.", - FutureWarning, - ) - return self.codes - @property def ndim(self): - """Dimension of the data. For MultiIndex ndim is always 2. - """ + """Dimension of the data. For MultiIndex ndim is always 2.""" return 2 def _get_level_label(self, level): - """ Get name of the level. + """Get name of the level. 
Parameters ---------- @@ -658,8 +590,6 @@ def isin(self, values, level=None): >>> midx.isin([(1, 'red'), (3, 'red')]) array([ True, False, False]) """ - from cudf.api.types import is_list_like - if level is None: if isinstance(values, cudf.MultiIndex): values_idx = values @@ -708,11 +638,6 @@ def isin(self, values, level=None): return result - def mask(self, cond, other=None, inplace=False): - raise NotImplementedError( - ".mask is not supported for MultiIndex operations" - ) - def where(self, cond, other=None, inplace=False): raise NotImplementedError( ".where is not supported for MultiIndex operations" @@ -795,9 +720,7 @@ def _index_and_downcast(self, result, index, index_key): ) or isinstance(index_key[0], slice): index_key = index_key[0] - slice_access = False - if isinstance(index_key, slice): - slice_access = True + slice_access = isinstance(index_key, slice) out_index = cudf.DataFrame() # Select the last n-k columns where n is the number of columns and k is # the length of the indexing tuple @@ -805,30 +728,24 @@ def _index_and_downcast(self, result, index, index_key): if not isinstance(index_key, (numbers.Number, slice)): size = len(index_key) for k in range(size, len(index._data)): - if index.names is None: - name = k - else: - name = index.names[k] out_index.insert( - len(out_index.columns), - name, + out_index._num_columns, + k if index.names is None else index.names[k], cudf.Series._from_data({None: index._data.columns[k]}), ) - if len(result) == 1 and size == 0 and slice_access is False: + if len(result) == 1 and size == 0 and not slice_access: # If the final result is one row and it was not mapped into # directly, return a Series with a tuple as name. 
result = result.T result = result[result._data.names[0]] - elif len(result) == 0 and slice_access is False: + elif len(result) == 0 and not slice_access: # Pandas returns an empty Series with a tuple as name # the one expected result column - series_name = [] - for col in index._data.columns: - series_name.append(col[0]) - result = cudf.Series([]) - result.name = tuple(series_name) - elif len(out_index.columns) == 1: + result = cudf.Series._from_data( + {}, name=tuple((col[0] for col in index._data.columns)) + ) + elif out_index._num_columns == 1: # If there's only one column remaining in the output index, convert # it into an Index and name the final index values according # to that column's name. @@ -836,11 +753,18 @@ def _index_and_downcast(self, result, index, index_key): out_index = as_index(last_column) out_index.name = index.names[-1] index = out_index - elif len(out_index.columns) > 1: + elif out_index._num_columns > 1: # Otherwise pop the leftmost levels, names, and codes from the # source index until it has the correct number of columns (n-k) result.reset_index(drop=True) - index = index._popn(size) + if index.names is not None: + result.names = index.names[size:] + index = MultiIndex( + levels=index.levels[size:], + codes=index.codes.iloc[:, size:], + names=index.names[size:], + ) + if isinstance(index_key, tuple): result = result.set_index(index) return result @@ -896,24 +820,6 @@ def _validate_indexer( for i in indexer: self._validate_indexer(i) - def _split_tuples(self, tuples): - if len(tuples) == 1: - return tuples, slice(None) - elif isinstance(tuples[0], tuple): - row = tuples[0] - if len(tuples) == 1: - column = slice(None) - else: - column = tuples[1] - return row, column - elif isinstance(tuples[0], slice): - return tuples - else: - return tuples, slice(None) - - def __len__(self): - return self._data.nrows - def __eq__(self, other): if isinstance(other, MultiIndex): for self_col, other_col in zip( @@ -924,24 +830,16 @@ def __eq__(self, other): 
return self.names == other.names return NotImplemented - @property - def is_contiguous(self): - return True - @property def size(self): - return len(self) + # The size of a MultiIndex is only dependent on the number of rows. + return self._num_rows def take(self, indices): - from collections.abc import Sequence - from numbers import Integral - if isinstance(indices, (Integral, Sequence)): indices = np.array(indices) - elif isinstance(indices, cudf.Series): - if indices.has_nulls: - raise ValueError("Column must have no nulls.") - indices = indices + elif isinstance(indices, cudf.Series) and indices.has_nulls: + raise ValueError("Column must have no nulls.") elif isinstance(indices, slice): start, stop, step = indices.indices(len(self)) indices = column.arange(start, stop, step) @@ -983,15 +881,11 @@ def deserialize(cls, header, frames): return cls._from_data(dict(zip(names, columns))) def __getitem__(self, index): - match = self.take(index) - if isinstance(index, slice): - return match if isinstance(index, int): # we are indexing into a single row of the MultiIndex, # return that row as a tuple: - return match.to_pandas()[0] - else: - return match + return self.take(index).to_pandas()[0] + return self.take(index) def to_frame(self, index=True, name=None): # TODO: Currently this function makes a shallow copy, which is @@ -1003,7 +897,7 @@ def to_frame(self, index=True, name=None): if name is not None: if len(name) != len(self.levels): raise ValueError( - "'name' should have th same length as " + "'name' should have the same length as " "number of levels on index." 
) df.columns = name @@ -1095,8 +989,7 @@ def from_tuples(cls, tuples, names=None): """ # Use Pandas for handling Python host objects pdi = pd.MultiIndex.from_tuples(tuples, names=names) - result = cls.from_pandas(pdi) - return result + return cls.from_pandas(pdi) @property def values_host(self): @@ -1426,18 +1319,6 @@ def from_pandas(cls, multiindex, nan_as_null=None): def is_unique(self): return len(self) == len(self.unique()) - @property - def is_monotonic(self): - """Return boolean if values in the object are monotonic_increasing. - - This property is an alias for :attr:`is_monotonic_increasing`. - - Returns - ------- - bool - """ - return self.is_monotonic_increasing - @property def is_monotonic_increasing(self): """ @@ -1539,13 +1420,9 @@ def memory_usage(self, deep=False): return n def difference(self, other, sort=None): - temp_self = self - temp_other = other - if hasattr(self, "to_pandas"): - temp_self = self.to_pandas() if hasattr(other, "to_pandas"): - temp_other = self.to_pandas() - return temp_self.difference(temp_other, sort) + other = other.to_pandas() + return self.to_pandas().difference(other, sort) def append(self, other): """ @@ -1609,12 +1486,6 @@ def append(self, other): return MultiIndex._concat(to_concat) - def nan_to_num(*args, **kwargs): - return args[0] - - def array_equal(*args, **kwargs): - return args[0] == args[1] - def __array_function__(self, func, types, args, kwargs): cudf_df_module = MultiIndex @@ -1650,8 +1521,8 @@ def _level_index_from_level(self, level): try: return self.names.index(level) except ValueError: - if not pd.api.types.is_integer(level): - raise KeyError(f"Level {level} not found") from None + if not is_integer(level): + raise KeyError(f"Level {level} not found") if level < 0: level += self.nlevels if level >= self.nlevels: @@ -1661,9 +1532,6 @@ def _level_index_from_level(self, level): ) from None return level - def _level_name_from_level(self, level): - return self.names[self._level_index_from_level(level)] - def 
get_loc(self, key, method=None, tolerance=None): """ Get location for a label or a tuple of labels. diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 465cf36e1f3..40bbdc4a865 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1553,3 +1553,20 @@ def test_multiIndex_duplicate_names(): ) assert_eq(gi, pi) + + +def test_difference(): + midx = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], + names=["x", "y"], + ) + midx2 = cudf.MultiIndex( + levels=[[1, 3, 4, 5], [1, 2, 5]], + codes=[[0, 0, 1, 2, 3, 3], [0, 2, 1, 1, 0, 2]], + names=["x", "y"], + ) + + expected = midx2.to_pandas().difference(midx.to_pandas()) + actual = midx2.difference(midx) + assert_eq(expected, actual) From a4771b3307b93f391dbeb647afb146c8afb7f060 Mon Sep 17 00:00:00 2001 From: Peixin Date: Wed, 22 Sep 2021 23:03:33 +0800 Subject: [PATCH 08/26] Update cudf java bindings to 21.12.0-SNAPSHOT (#9248) Signed-off-by: Peixin Li update cudf JNI version to 21.12.0 Authors: - Peixin (https://github.com/pxLi) Approvers: - Jason Lowe (https://github.com/jlowe) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9248 --- java/ci/README.md | 4 ++-- java/pom.xml | 2 +- java/src/main/native/CMakeLists.txt | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/java/ci/README.md b/java/ci/README.md index ef3a329f7f6..5432dc8d0f1 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.2.2-devel-centos7 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. 
```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-21.10 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-21.12 ``` ### Build cuDF jar with devtoolset @@ -47,5 +47,5 @@ scl enable devtoolset-9 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-21.10.0-SNAPSHOT-cuda11.jar. +You can find the cuDF jar in java/target/ like cudf-21.12.0-SNAPSHOT-cuda11.jar. diff --git a/java/pom.xml b/java/pom.xml index 1b4a31116d4..db79f94009b 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 21.10.0-SNAPSHOT + 21.12.0-SNAPSHOT cudfjni diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 3aa9f14bac4..2c95c6eebac 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -15,7 +15,7 @@ #============================================================================= cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) From 9da7c01bf394243ae37319277e83a8edda3b4c70 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 22 Sep 2021 12:37:56 -0400 Subject: [PATCH 09/26] Fix call to thrust::reduce_by_key in argmin/argmax libcudf groupby (#9263) Closes #9156 This PR simplifies the parameters when calling thrust::reduce_by_key for the argmin/argmax aggregations in cudf::groupby. 
The illegalMemoryAccess found in #9156 was due to invalid data being passed from thrust::reduce_by_key through to the BinaryPredicate function as documented in NVIDIA/thrust#1525 The invalid data being passed is only a real issue for strings columns where the device pointer was neither nullptr nor a valid address. The new logic provides only size_type values to thrust::reduce_by_key so invalid values can only be out-of-bounds for the input column which is easily checked before retrieving the string_view objects within the ArgMin and ArgMax operators. This the same as #9244 but based on 21.10 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Devavret Makkar (https://github.com/devavret) - Nghia Truong (https://github.com/ttnghia) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/9263 --- .../sort/group_single_pass_reduction_util.cuh | 96 ++++++------------- 1 file changed, 30 insertions(+), 66 deletions(-) diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 8eccadd653e..db2ae5b5d8e 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -31,77 +31,50 @@ #include #include #include -#include -#include #include namespace cudf { namespace groupby { namespace detail { -// ArgMin binary operator with tuple of (value, index) +/** + * @brief ArgMin binary operator with index values into input column. + * + * @tparam T Type of the underlying column. Must support '<' operator. 
+ */ template struct ArgMin { - CUDA_HOST_DEVICE_CALLABLE auto operator()(thrust::tuple const& lhs, - thrust::tuple const& rhs) const - { - if (thrust::get<1>(lhs) == cudf::detail::ARGMIN_SENTINEL) - return rhs; - else if (thrust::get<1>(rhs) == cudf::detail::ARGMIN_SENTINEL) - return lhs; - else - return thrust::get<0>(lhs) < thrust::get<0>(rhs) ? lhs : rhs; - } -}; - -// ArgMax binary operator with tuple of (value, index) -template -struct ArgMax { - CUDA_HOST_DEVICE_CALLABLE auto operator()(thrust::tuple const& lhs, - thrust::tuple const& rhs) const - { - if (thrust::get<1>(lhs) == cudf::detail::ARGMIN_SENTINEL) - return rhs; - else if (thrust::get<1>(rhs) == cudf::detail::ARGMIN_SENTINEL) - return lhs; - else - return thrust::get<0>(lhs) > thrust::get<0>(rhs) ? lhs : rhs; - } -}; - -struct get_tuple_second_element { - template - __device__ size_type operator()(thrust::tuple const& rhs) const + column_device_view const d_col; + CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const { - return thrust::get<1>(rhs); + // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and + // github.com/NVIDIA/thrust/issues/1525 + // where invalid random values may be passed here by thrust::reduce_by_key + if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; } + if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; } + return d_col.element(lhs) < d_col.element(rhs) ? lhs : rhs; } }; /** - * @brief Functor to store the boolean value to null mask. + * @brief ArgMax binary operator with index values into input column. + * + * @tparam T Type of the underlying column. Must support '<' operator. 
*/ -struct bool_to_nullmask { - mutable_column_device_view d_result; - __device__ void operator()(size_type i, bool rhs) +template +struct ArgMax { + column_device_view const d_col; + CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const { - if (rhs) { - d_result.set_valid(i); - } else { - d_result.set_null(i); - } + // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and + // github.com/NVIDIA/thrust/issues/1525 + // where invalid random values may be passed here by thrust::reduce_by_key + if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; } + if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; } + return d_col.element(rhs) < d_col.element(lhs) ? lhs : rhs; } }; -/** - * @brief Returns index for non-null element, and SENTINEL for null element in a column. - * - */ -struct null_as_sentinel { - column_device_view const col; - size_type const SENTINEL; - __device__ size_type operator()(size_type i) const { return col.is_null(i) ? SENTINEL : i; } -}; - /** * @brief Value accessor for column which supports dictionary column too. * @@ -191,25 +164,16 @@ struct reduce_functor { auto resultview = mutable_column_device_view::create(result->mutable_view(), stream); auto valuesview = column_device_view::create(values, stream); if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) { - constexpr auto SENTINEL = - (K == aggregation::ARGMAX ? cudf::detail::ARGMAX_SENTINEL : cudf::detail::ARGMIN_SENTINEL); - auto idx_begin = - cudf::detail::make_counting_transform_iterator(0, null_as_sentinel{*valuesview, SENTINEL}); - // dictionary keys are sorted, so dictionary32 index comparison is enough. 
- auto column_begin = valuesview->begin(); - auto begin = thrust::make_zip_iterator(thrust::make_tuple(column_begin, idx_begin)); - auto result_begin = thrust::make_transform_output_iterator(resultview->begin(), - get_tuple_second_element{}); using OpType = std::conditional_t<(K == aggregation::ARGMAX), ArgMax, ArgMin>; thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.data(), group_labels.data() + group_labels.size(), - begin, + thrust::make_counting_iterator(0), thrust::make_discard_iterator(), - result_begin, - thrust::equal_to{}, - OpType{}); + resultview->begin(), + thrust::equal_to{}, + OpType{*valuesview}); } else { auto init = OpType::template identity(); auto begin = cudf::detail::make_counting_transform_iterator( From 20713dff41fd6668e6e631c148e86424597b4934 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 22 Sep 2021 09:55:34 -0700 Subject: [PATCH 10/26] Explicit about bitwidth difference between cudf boolean and arrow boolean (#9192) Currently, we map boolean type to `pa.int8` because the bitwidth of cudf boolean mismatches that in arrow. However the implication of this mapping is subtle and may cause unwanted result such as: ```python >>> cudf.StructDtype({ "a": np.bool_, "b": np.int8, }) StructDtype({'a': dtype('int8'), 'b': dtype('int8')}) ``` This PR changes the mapping back to `pa.bool_`, and use explicit type handling when we are dealing with type conversion to arrow. Authors: - Michael Wang (https://github.com/isVoid) Approvers: - https://github.com/brandon-b-miller - H. 
Thomson Comer (https://github.com/thomcom) URL: https://github.com/rapidsai/cudf/pull/9192 --- python/cudf/cudf/_lib/utils.pyx | 20 ++++++++++++++++++-- python/cudf/cudf/core/column/column.py | 5 +---- python/cudf/cudf/tests/test_dtypes.py | 9 +++++++++ python/cudf/cudf/utils/dtypes.py | 7 ++++++- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index dd12c92a15a..810cdd51df5 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. +import numpy as np import pyarrow as pa import cudf @@ -81,7 +82,14 @@ cpdef generate_pandas_metadata(Table table, index): ): types.append(col.dtype.to_arrow()) else: - types.append(np_to_pa_dtype(col.dtype)) + # A boolean element takes 8 bits in cudf and 1 bit in + # pyarrow. To make sure the cudf format is interperable + # in arrow, we use `int8` type when converting from a + # cudf boolean array. + if col.dtype.type == np.bool_: + types.append(pa.int8()) + else: + types.append(np_to_pa_dtype(col.dtype)) # Indexes if index is not False: @@ -125,7 +133,15 @@ cpdef generate_pandas_metadata(Table table, index): elif is_list_dtype(idx): types.append(col.dtype.to_arrow()) else: - types.append(np_to_pa_dtype(idx.dtype)) + # A boolean element takes 8 bits in cudf and 1 bit in + # pyarrow. To make sure the cudf format is interperable + # in arrow, we use `int8` type when converting from a + # cudf boolean array. 
+ if idx.dtype.type == np.bool_: + types.append(pa.int8()) + else: + types.append(np_to_pa_dtype(idx.dtype)) + index_levels.append(idx) col_names.append(name) index_descriptors.append(descr) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8f18d83eb31..de278db919d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2089,10 +2089,7 @@ def as_column( data ) np_type = np.dtype(dtype).type - if np_type == np.bool_: - pa_type = pa.bool_() - else: - pa_type = np_to_pa_dtype(np.dtype(dtype)) + pa_type = np_to_pa_dtype(np.dtype(dtype)) data = as_column( pa.array( arbitrary, diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index d98ab0504cc..877cec24afa 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -324,3 +324,12 @@ def test_dtype(in_dtype, expect): def test_dtype_raise(in_dtype): with pytest.raises(TypeError): cudf.dtype(in_dtype) + + +def test_dtype_np_bool_to_pa_bool(): + """This test case captures that utility np_to_pa_dtype + should map np.bool_ to pa.bool_, nuances on bit width + difference should be handled elsewhere. + """ + + assert np_to_pa_dtype(np.dtype("bool")) == pa.bool_() diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 5100f1a9c49..bdaf5e144a5 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -14,6 +14,11 @@ from cudf.core._compat import PANDAS_GE_120 _NA_REP = "" + +"""Map numpy dtype to pyarrow types. +Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special +handling is required when converting a Boolean column into arrow. 
+""" _np_pa_dtypes = { np.float64: pa.float64(), np.float32: pa.float32(), @@ -22,7 +27,7 @@ np.int32: pa.int32(), np.int16: pa.int16(), np.int8: pa.int8(), - np.bool_: pa.int8(), + np.bool_: pa.bool_(), np.uint64: pa.uint64(), np.uint32: pa.uint32(), np.uint16: pa.uint16(), From b0c8bbbab0c819881fab1323fb68f70971a224d7 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 22 Sep 2021 12:21:31 -0500 Subject: [PATCH 11/26] Fix Java column leak in testParquetWriteMap (#9271) Fixes a Java column vector leak in TableTest#testParquetWriteMap. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/9271 --- java/src/test/java/ai/rapids/cudf/TableTest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index cd1e433d07b..b69dce57180 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -6669,8 +6669,9 @@ void testParquetWriteMap() throws IOException { HostColumnVector.StructType structType = new HostColumnVector.StructType(true, Arrays.asList(new HostColumnVector.BasicType(true, DType.STRING), new HostColumnVector.BasicType(true, DType.STRING))); - try (Table t0 = new Table(ColumnVector.fromLists(new HostColumnVector.ListType(true, - structType), list1, list2, list3))) { + try (ColumnVector listColumn = ColumnVector.fromLists(new HostColumnVector.ListType(true, + structType), list1, list2, list3); + Table t0 = new Table(listColumn)) { try (TableWriter writer = Table.writeParquetChunked(options, f)) { writer.write(t0); } From ef5ba4cee31a5f335314b5ceec9d0db473aef7a0 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Wed, 22 Sep 2021 14:31:11 -0400 Subject: [PATCH 12/26] Fixing empty input to getMapValue crashing (#9262) This changes the calls in java/cudf to check for 
an empty input and return an empty result instead of crashing. Fixes #9253 Authors: - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9262 --- java/src/main/native/src/map_lookup.cu | 6 +++++- .../test/java/ai/rapids/cudf/ColumnVectorTest.java | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/java/src/main/native/src/map_lookup.cu b/java/src/main/native/src/map_lookup.cu index ad791747713..683651799e7 100644 --- a/java/src/main/native/src/map_lookup.cu +++ b/java/src/main/native/src/map_lookup.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -183,6 +183,10 @@ std::unique_ptr map_lookup(column_view const &map_column, string_scalar // Defensive checks. map_input_check(map_column, stream); + if (map_column.size() == 0) { + return make_empty_column(cudf::data_type{cudf::type_id::STRING}); + } + lists_column_view lcv{map_column}; column_view structs_column = lcv.get_sliced_child(stream); // Two-pass plan: construct gather map, and then gather() on structs_column.child(1). Plan A. 
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 0643776a546..d1af0d9a2f6 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -5412,6 +5412,17 @@ void testGetMapValue() { } } + @Test + void testGetMapValueEmptyInput() { + HostColumnVector.StructType structType = new HostColumnVector.StructType(true, Arrays.asList(new HostColumnVector.BasicType(true, DType.STRING), + new HostColumnVector.BasicType(true, DType.STRING))); + try (ColumnVector cv = ColumnVector.fromLists(new HostColumnVector.ListType(true, structType)); + ColumnVector res = cv.getMapValue(Scalar.fromString("a")); + ColumnVector expected = ColumnVector.fromStrings()) { + assertColumnsAreEqual(expected, res); + } + } + @Test void testGetMapKeyExistence() { List list1 = Arrays.asList(new HostColumnVector.StructData("a", "b")); From 08cbbcdcea2c9fb18e5614f5e29ba99b5443d38f Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Thu, 23 Sep 2021 00:28:03 +0530 Subject: [PATCH 13/26] Use nvcomp's snappy compressor in ORC writer (#9242) Issue #9205 depends on #9235 Authors: - Devavret Makkar (https://github.com/devavret) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Elias Stehle (https://github.com/elstehle) - https://github.com/nvdbaranec - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/9242 --- cpp/src/io/orc/orc_common.h | 3 +- cpp/src/io/orc/orc_gpu.h | 6 ++- cpp/src/io/orc/stripe_enc.cu | 96 ++++++++++++++++++++++++++++++----- cpp/src/io/orc/stripe_init.cu | 16 +++--- cpp/src/io/orc/writer_impl.cu | 51 ++++++++++--------- 5 files changed, 126 insertions(+), 46 deletions(-) diff --git a/cpp/src/io/orc/orc_common.h b/cpp/src/io/orc/orc_common.h index ab6788d01f1..eedaa9d4fc2 100644 --- a/cpp/src/io/orc/orc_common.h +++ 
b/cpp/src/io/orc/orc_common.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ namespace orc { // ORC rows are divided into groups and assigned indexes for faster seeking static constexpr uint32_t default_row_index_stride = 10000; +static constexpr uint32_t BLOCK_HEADER_SIZE = 3; enum CompressionKind : uint8_t { NONE = 0, diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index 30687331c15..88d7e26b3b6 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -355,6 +355,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, * @param[in] num_compressed_blocks Total number of compressed blocks * @param[in] compression Type of compression * @param[in] comp_blk_size Compression block size + * @param[in] max_comp_blk_size Max size of any block after compression * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in,out] enc_streams chunk streams device array [column][rowgroup] * @param[out] comp_in Per-block compression input parameters @@ -365,10 +366,11 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, + uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - gpu_inflate_input_s* comp_in, - gpu_inflate_status_s* comp_out, + device_span comp_in, + device_span comp_out, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index d50d3898c3b..9348d817dad 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -24,6 +24,9 @@ #include #include +#include + +#include namespace cudf { namespace io { @@ -1102,15 +1105,17 @@ __global__ void __launch_bounds__(1024) * @param[out] comp_out Per-block compression status * 
@param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size + * @param[in] max_comp_blk_size Max size of any block after compression */ // blockDim {256,1,1} __global__ void __launch_bounds__(256) gpuInitCompressionBlocks(device_2dspan strm_desc, device_2dspan streams, // const? - gpu_inflate_input_s* comp_in, - gpu_inflate_status_s* comp_out, + device_span comp_in, + device_span comp_out, uint8_t* compressed_bfr, - uint32_t comp_blk_size) + uint32_t comp_blk_size, + uint32_t max_comp_blk_size) { __shared__ __align__(16) StripeStream ss; __shared__ uint8_t* volatile uncomp_base_g; @@ -1135,8 +1140,8 @@ __global__ void __launch_bounds__(256) uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); blk_in->srcDevice = src + b * comp_blk_size; blk_in->srcSize = blk_size; - blk_in->dstDevice = dst + b * (3 + comp_blk_size) + 3; // reserve 3 bytes for block header - blk_in->dstSize = blk_size; + blk_in->dstDevice = dst + b * (BLOCK_HEADER_SIZE + max_comp_blk_size) + BLOCK_HEADER_SIZE; + blk_in->dstSize = max_comp_blk_size; blk_out->bytes_written = blk_size; blk_out->status = 1; blk_out->reserved = 0; @@ -1153,14 +1158,16 @@ __global__ void __launch_bounds__(256) * @param[in] comp_out Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size + * @param[in] max_comp_blk_size Max size of any block after compression */ // blockDim {1024,1,1} __global__ void __launch_bounds__(1024) gpuCompactCompressedBlocks(device_2dspan strm_desc, - gpu_inflate_input_s* comp_in, - gpu_inflate_status_s* comp_out, + device_span comp_in, + device_span comp_out, uint8_t* compressed_bfr, - uint32_t comp_blk_size) + uint32_t comp_blk_size, + uint32_t max_comp_blk_size) { __shared__ __align__(16) StripeStream ss; __shared__ const uint8_t* volatile comp_src_g; @@ -1271,20 +1278,83 @@ void CompressOrcDataStreams(uint8_t* compressed_data, 
uint32_t num_compressed_blocks, CompressionKind compression, uint32_t comp_blk_size, + uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - gpu_inflate_input_s* comp_in, - gpu_inflate_status_s* comp_out, + device_span comp_in, + device_span comp_out, rmm::cuda_stream_view stream) { dim3 dim_block_init(256, 1); dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); gpuInitCompressionBlocks<<>>( - strm_desc, enc_streams, comp_in, comp_out, compressed_data, comp_blk_size); - if (compression == SNAPPY) { gpu_snap(comp_in, comp_out, num_compressed_blocks, stream); } + strm_desc, enc_streams, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); + if (compression == SNAPPY) { + auto env_use_nvcomp = std::getenv("LIBCUDF_USE_NVCOMP"); + bool use_nvcomp = env_use_nvcomp != nullptr ? std::atoi(env_use_nvcomp) : 0; + if (use_nvcomp) { + try { + size_t temp_size; + nvcompStatus_t nvcomp_status = nvcompBatchedSnappyCompressGetTempSize( + num_compressed_blocks, comp_blk_size, nvcompBatchedSnappyDefaultOpts, &temp_size); + + CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, + "Error in getting snappy compression scratch size"); + + rmm::device_buffer scratch(temp_size, stream); + rmm::device_uvector uncompressed_data_ptrs(num_compressed_blocks, stream); + rmm::device_uvector uncompressed_data_sizes(num_compressed_blocks, stream); + rmm::device_uvector compressed_data_ptrs(num_compressed_blocks, stream); + rmm::device_uvector compressed_bytes_written(num_compressed_blocks, stream); + + auto comp_it = thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), + uncompressed_data_sizes.begin(), + compressed_data_ptrs.begin()); + thrust::transform(rmm::exec_policy(stream), + comp_in.begin(), + comp_in.end(), + comp_it, + [] __device__(gpu_inflate_input_s in) { + return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice); + }); + nvcomp_status = nvcompBatchedSnappyCompressAsync(uncompressed_data_ptrs.data(), + 
uncompressed_data_sizes.data(), + max_comp_blk_size, + num_compressed_blocks, + scratch.data(), + scratch.size(), + compressed_data_ptrs.data(), + compressed_bytes_written.data(), + nvcompBatchedSnappyDefaultOpts, + stream.value()); + + CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "Error in snappy compression"); + + thrust::transform(rmm::exec_policy(stream), + compressed_bytes_written.begin(), + compressed_bytes_written.end(), + comp_out.begin(), + [] __device__(size_t size) { + gpu_inflate_status_s status{}; + status.bytes_written = size; + return status; + }); + } catch (...) { + // If we reach this then there was an error in compressing so set an error status for each + // block + thrust::for_each(rmm::exec_policy(stream), + comp_out.begin(), + comp_out.end(), + [] __device__(gpu_inflate_status_s & stat) { stat.status = 1; }); + }; + + } else { + gpu_snap(comp_in.data(), comp_out.data(), num_compressed_blocks, stream); + } + } dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( - strm_desc, comp_in, comp_out, compressed_data, comp_blk_size); + strm_desc, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); } } // namespace gpu diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 94d8de6561b..d6dbdbe6403 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -52,13 +52,13 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat uint32_t max_uncompressed_block_size = 0; uint32_t num_compressed_blocks = 0; uint32_t num_uncompressed_blocks = 0; - while (cur + 3 < end) { + while (cur + BLOCK_HEADER_SIZE < end) { uint32_t block_len = shuffle((lane_id == 0) ? 
cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); uint32_t is_uncompressed = block_len & 1; uint32_t uncompressed_size; gpu_inflate_input_s* init_ctl = nullptr; block_len >>= 1; - cur += 3; + cur += BLOCK_HEADER_SIZE; if (block_len > block_size || cur + block_len > end) { // Fatal num_compressed_blocks = 0; @@ -145,12 +145,12 @@ extern "C" __global__ void __launch_bounds__(128, 8) uint32_t num_compressed_blocks = 0; uint32_t max_compressed_blocks = s->info.num_compressed_blocks; - while (cur + 3 < end) { + while (cur + BLOCK_HEADER_SIZE < end) { uint32_t block_len = shuffle((lane_id == 0) ? cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); uint32_t is_uncompressed = block_len & 1; uint32_t uncompressed_size_est, uncompressed_size_actual; block_len >>= 1; - cur += 3; + cur += BLOCK_HEADER_SIZE; if (cur + block_len > end) { break; } if (is_uncompressed) { uncompressed_size_est = block_len; @@ -367,9 +367,11 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, for (;;) { uint32_t block_len, is_uncompressed; - if (cur + 3 > end || cur + 3 >= start + compressed_offset) { break; } + if (cur + BLOCK_HEADER_SIZE > end || cur + BLOCK_HEADER_SIZE >= start + compressed_offset) { + break; + } block_len = cur[0] | (cur[1] << 8) | (cur[2] << 16); - cur += 3; + cur += BLOCK_HEADER_SIZE; is_uncompressed = block_len & 1; block_len >>= 1; cur += block_len; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e0018ed7166..8a0112deb76 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -36,6 +36,8 @@ #include #include +#include + #include #include #include @@ -999,10 +1001,10 @@ void writer::impl::write_index_stream(int32_t stripe_id, record.pos += stream.lengths[type]; while ((record.pos >= 0) && (record.blk_pos >= 0) && (static_cast(record.pos) >= compression_blocksize_) && - (record.comp_pos + 3 + comp_out[record.blk_pos].bytes_written < + (record.comp_pos + BLOCK_HEADER_SIZE + 
comp_out[record.blk_pos].bytes_written < static_cast(record.comp_size))) { record.pos -= compression_blocksize_; - record.comp_pos += 3 + comp_out[record.blk_pos].bytes_written; + record.comp_pos += BLOCK_HEADER_SIZE + comp_out[record.blk_pos].bytes_written; record.blk_pos += 1; } } @@ -1472,29 +1474,31 @@ void writer::impl::write(table_view const& table) } // Allocate intermediate output stream buffer - size_t compressed_bfr_size = 0; - size_t num_compressed_blocks = 0; - auto stream_output = [&]() { + size_t compressed_bfr_size = 0; + size_t num_compressed_blocks = 0; + size_t max_compressed_block_size = 0; + if (compression_kind_ != NONE) { + nvcompBatchedSnappyCompressGetMaxOutputChunkSize( + compression_blocksize_, nvcompBatchedSnappyDefaultOpts, &max_compressed_block_size); + } + auto stream_output = [&]() { size_t max_stream_size = 0; bool all_device_write = true; - for (size_t stripe_id = 0; stripe_id < segmentation.num_stripes(); stripe_id++) { - for (size_t i = 0; i < num_data_streams; i++) { // TODO range for (at least) - gpu::StripeStream* ss = &strm_descs[stripe_id][i]; - if (!out_sink_->is_device_write_preferred(ss->stream_size)) { all_device_write = false; } - size_t stream_size = ss->stream_size; - if (compression_kind_ != NONE) { - ss->first_block = num_compressed_blocks; - ss->bfr_offset = compressed_bfr_size; - - auto num_blocks = std::max( - (stream_size + compression_blocksize_ - 1) / compression_blocksize_, 1); - stream_size += num_blocks * 3; - num_compressed_blocks += num_blocks; - compressed_bfr_size += stream_size; - } - max_stream_size = std::max(max_stream_size, stream_size); + for (auto& ss : strm_descs.host_view().flat_view()) { + if (!out_sink_->is_device_write_preferred(ss.stream_size)) { all_device_write = false; } + size_t stream_size = ss.stream_size; + if (compression_kind_ != NONE) { + ss.first_block = num_compressed_blocks; + ss.bfr_offset = compressed_bfr_size; + + auto num_blocks = std::max( + (stream_size + 
compression_blocksize_ - 1) / compression_blocksize_, 1); + stream_size += num_blocks * BLOCK_HEADER_SIZE; + num_compressed_blocks += num_blocks; + compressed_bfr_size += (max_compressed_block_size + BLOCK_HEADER_SIZE) * num_blocks; } + max_stream_size = std::max(max_stream_size, stream_size); } if (all_device_write) { @@ -1519,10 +1523,11 @@ void writer::impl::write(table_view const& table) num_compressed_blocks, compression_kind_, compression_blocksize_, + max_compressed_block_size, strm_descs, enc_data.streams, - comp_in.device_ptr(), - comp_out.device_ptr(), + comp_in, + comp_out, stream); strm_descs.device_to_host(stream); comp_out.device_to_host(stream, true); From 10fd071dc12e35f02192d7bdd14af03221bb2ae9 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 22 Sep 2021 12:05:14 -0700 Subject: [PATCH 14/26] Add `dseries.struct.explode` (#9086) Closes #8660 Per discussions in thread #8872 , this PR adds a struct-accessor member function to provide a lateral view to a struct type series. Example: ```python >>> import cudf, dask_cudf as dgd >>> ds = dgd.from_cudf(cudf.Series( ... [{'a': 42, 'b': 'str1', 'c': [-1]}, ... {'a': 0, 'b': 'str2', 'c': [400, 500]}, ... 
{'a': 7, 'b': '', 'c': []}]), npartitions=2) >>> ds.struct.explode().compute() a b c 0 42 str1 [-1] 1 0 str2 [400, 500] 2 7 [] ``` Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/9086 --- python/dask_cudf/dask_cudf/accessors.py | 26 +++++++++++++++++++ .../dask_cudf/tests/test_accessor.py | 15 +++++++++++ 2 files changed, 41 insertions(+) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 77973ee34ff..1c21fca51c8 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -37,6 +37,32 @@ def field(self, key): meta=self.d_series._meta._constructor([], dtype=typ), ) + def explode(self): + """ + Creates a dataframe view of the struct column, one column per field. + + Returns + ------- + DataFrame + + Examples + -------- + >>> import cudf, dask_cudf + >>> ds = dask_cudf.from_cudf(cudf.Series( + ... [{'a': 42, 'b': 'str1', 'c': [-1]}, + ... {'a': 0, 'b': 'str2', 'c': [400, 500]}, + ... 
{'a': 7, 'b': '', 'c': []}]), npartitions=2) + >>> ds.struct.explode().compute() + a b c + 0 42 str1 [-1] + 1 0 str2 [400, 500] + 2 7 [] + """ + return self.d_series.map_partitions( + lambda s: s.struct.explode(), + meta=self.d_series._meta.struct.explode(), + ) + class ListMethods: def __init__(self, d_series): diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 805927dd474..1521ce41806 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -499,3 +499,18 @@ def test_dask_struct_field_Int_Error(data): with pytest.raises(IndexError): got.struct.field(1000).compute() + + +@pytest.mark.parametrize( + "data", + [ + [{}, {}, {}], + [{"a": 100, "b": "abc"}, {"a": 42, "b": "def"}, {"a": -87, "b": ""}], + [{"a": [1, 2, 3], "b": {"c": 101}}, {"a": [4, 5], "b": {"c": 102}}], + ], +) +def test_struct_explode(data): + expect = Series(data).struct.explode() + got = dgd.from_cudf(Series(data), 2).struct.explode() + + assert_eq(expect, got.compute()) From 1cb527f01aac631f4d44866b5474e503501d58cd Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 23 Sep 2021 01:26:59 +0530 Subject: [PATCH 15/26] Add shallow hash function and shallow equality comparison for column_view (#9185) Fixes #9140 Added `shallow_hash(column_view)` Added unit tests It computes hash values based on the shallow states of `column_view`: type, size, data pointer, null_mask pointer, offset, and the hash value of the children. `null_count` is not used since it is a cached value and it may vary based on contents of `null_mask`, and may be pre-computed or not. Fixes #9139 Added `is_shallow_equivalent(column_view, column_view)` ~shallow_equal~ Added unit tests It compares two column_views based on the shallow states of column_view: type, size, data pointer, null_mask pointer, offset, and the column_view of the children. 
null_count is not used since it is a cached value and it may vary based on contents of null_mask, and may be pre-computed or not. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Mark Harris (https://github.com/harrism) - Vyas Ramasubramani (https://github.com/vyasr) - Jake Hemstad (https://github.com/jrhemstad) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/9185 --- cpp/include/cudf/column/column_view.hpp | 41 ++ cpp/include/cudf/detail/hashing.hpp | 36 ++ .../cudf/detail/utilities/hash_functions.cuh | 12 + cpp/include/cudf_test/type_lists.hpp | 12 + cpp/src/column/column_view.cpp | 55 +++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/column/column_view_shallow_test.cpp | 442 ++++++++++++++++++ 7 files changed, 599 insertions(+) create mode 100644 cpp/tests/column/column_view_shallow_test.cpp diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index 7feaeafbad0..cd490c3c832 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -633,4 +633,45 @@ column_view bit_cast(column_view const& input, data_type type); */ mutable_column_view bit_cast(mutable_column_view const& input, data_type type); +namespace detail { +/** + * @brief Computes a hash value from the shallow state of the specified column + * + * For any two columns, if `is_shallow_equivalent(c0,c1)` then `shallow_hash(c0) == + * shallow_hash(c1)`. + * + * The complexity of computing the hash value of `input` is `O( count_descendants(input) )`, i.e., + * it is independent of the number of elements in the column. + * + * This function does _not_ inspect the elements of `input` nor access any device memory or launch + * any kernels. + * + * @param input The `column_view` to compute hash + * @return The hash value derived from the shallow state of `input`. 
+ */ +std::size_t shallow_hash(column_view const& input); + +/** + * @brief Uses only shallow state to determine if two `column_view`s view equivalent columns + * + * Two columns are equivalent if for any operation `F` then: + * ``` + * is_shallow_equivalent(c0, c1) ==> The results of F(c0) and F(c1) are equivalent + * ``` + * For any two non-empty columns, `is_shallow_equivalent(c0,c1)` is true only if they view the exact + * same physical column. In other words, two physically independent columns may have exactly + * equivalent elements but their shallow state would not be equivalent. + * + * The complexity of this function is `O( min(count_descendants(lhs), count_descendants(rhs)) )`, + * i.e., it is independent of the number of elements in either column. + * + * This function does _not_ inspect the elements of `lhs` or `rhs` nor access any device memory nor + * launch any kernels. + * + * @param lhs The left `column_view` to compare + * @param rhs The right `column_view` to compare + * @return If `lhs` and `rhs` have equivalent shallow state + */ +bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs); +} // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index 83d6be14709..bd5c8a42a51 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -19,6 +19,9 @@ #include +#include +#include + namespace cudf { namespace detail { @@ -53,5 +56,38 @@ std::unique_ptr serial_murmur_hash3_32( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/* Copyright 2005-2014 Daniel James. + * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ +/** + * @brief Combines two hashed values into a single hashed value. 
+ * + * Adapted from Boost hash_combine function, modified for 64-bit + * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html + * + * @param lhs The first hashed value + * @param rhs The second hashed value + * @return Combined hash value + */ +constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) +{ + lhs ^= rhs + 0x9e3779b97f4a7c15 + (lhs << 6) + (lhs >> 2); + return lhs; +} } // namespace detail } // namespace cudf + +// specialization of std::hash for cudf::data_type +namespace std { +template <> +struct hash { + std::size_t operator()(cudf::data_type const& type) const noexcept + { + return cudf::detail::hash_combine(std::hash{}(static_cast(type.id())), + std::hash{}(type.scale())); + } +}; +} // namespace std diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 6eab13ae9af..65deadd6cd0 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -395,6 +395,12 @@ struct MurmurHash3_32 { return h; } + /* Copyright 2005-2014 Daniel James. + * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ /** * @brief Combines two hash values into a new single hash value. Called * repeatedly to create a hash value from several variables. @@ -795,6 +801,12 @@ struct IdentityHash { IdentityHash() = default; constexpr IdentityHash(uint32_t seed) : m_seed(seed) {} + /* Copyright 2005-2014 Daniel James. + * + * Use, modification and distribution is subject to the Boost Software + * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at + * http://www.boost.org/LICENSE_1_0.txt) + */ /** * @brief Combines two hash values into a new single hash value. Called * repeatedly to create a hash value from several variables. 
diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index 74688b7f133..982c94ac402 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -315,6 +315,18 @@ using FixedWidthTypesWithoutChrono = Concat; */ using ComparableTypes = Concat; +/** + * @brief Provides a list of all compound types for use in GTest typed tests. + * + * Example: + * ``` + * // Invokes all typed fixture tests for all compound types in libcudf + * TYPED_TEST_CASE(MyTypedFixture, cudf::test::CompoundTypes); + * ``` + */ +using CompoundTypes = + cudf::test::Types; + /** * @brief Provides a list of all types supported in libcudf for use in a GTest * typed test. diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 186669ae697..5749cb48c0e 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -22,6 +23,7 @@ #include +#include #include #include #include @@ -76,6 +78,59 @@ size_type column_view_base::null_count(size_type begin, size_type end) const ? 0 : cudf::count_unset_bits(null_mask(), offset() + begin, offset() + end); } + +// Struct to use custom hash combine and fold expression +struct HashValue { + std::size_t hash; + explicit HashValue(std::size_t h) : hash{h} {} + HashValue operator^(HashValue const& other) const + { + return HashValue{hash_combine(hash, other.hash)}; + } +}; + +template +constexpr auto hash(Ts&&... ts) +{ + return (... ^ HashValue(std::hash{}(ts))).hash; +} + +std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false) +{ + std::size_t const init = (is_parent_empty or c.is_empty()) + ? 
hash(c.type(), 0) + : hash(c.type(), c.size(), c.head(), c.null_mask(), c.offset()); + return std::accumulate(c.child_begin(), + c.child_end(), + init, + [&c, is_parent_empty](std::size_t hash, auto const& child) { + return hash_combine( + hash, shallow_hash_impl(child, c.is_empty() or is_parent_empty)); + }); +} + +std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl(input); } + +bool shallow_equivalent_impl(column_view const& lhs, + column_view const& rhs, + bool is_parent_empty = false) +{ + bool const is_empty = (lhs.is_empty() and rhs.is_empty()) or is_parent_empty; + return (lhs.type() == rhs.type()) and + (is_empty or ((lhs.size() == rhs.size()) and (lhs.head() == rhs.head()) and + (lhs.null_mask() == rhs.null_mask()) and (lhs.offset() == rhs.offset()))) and + std::equal(lhs.child_begin(), + lhs.child_end(), + rhs.child_begin(), + rhs.child_end(), + [is_empty](auto const& lhs_child, auto const& rhs_child) { + return shallow_equivalent_impl(lhs_child, rhs_child, is_empty); + }); +} +bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs) +{ + return shallow_equivalent_impl(lhs, rhs); +} } // namespace detail // Immutable view constructor diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 03f7967cee0..cde170fb598 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -33,6 +33,7 @@ endfunction() # - column tests ---------------------------------------------------------------------------------- ConfigureTest(COLUMN_TEST column/bit_cast_test.cpp + column/column_view_shallow_test.cpp column/column_test.cu column/column_device_view_test.cu column/compound_test.cu) diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp new file mode 100644 index 00000000000..f76f682bb2f --- /dev/null +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +// fixed_width, dict, string, list, struct +template ()>* = nullptr> +std::unique_ptr example_column() +{ + auto begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + return cudf::test::fixed_width_column_wrapper(begin, end).release(); +} + +template ()>* = nullptr> +std::unique_ptr example_column() +{ + return cudf::test::dictionary_column_wrapper( + {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0}) + .release(); +} + +template or + std::is_same_v>* = nullptr> +std::unique_ptr example_column() + +{ + return cudf::test::strings_column_wrapper( + {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}) + .release(); +} + +template >* = nullptr> +std::unique_ptr example_column() +{ + return cudf::test::lists_column_wrapper({{1, 2, 3}, {4, 5}, {}, {6, 7, 8}}).release(); +} + +template >* = nullptr> +std::unique_ptr example_column() +{ + auto begin = thrust::make_counting_iterator(1); + auto end = thrust::make_counting_iterator(16); + auto member_0 = cudf::test::fixed_width_column_wrapper(begin, end); + auto member_1 = cudf::test::fixed_width_column_wrapper(begin + 10, end + 10); + return cudf::test::structs_column_wrapper({member_0, member_1}).release(); +} + +template +struct ColumnViewShallowTests : 
public cudf::test::BaseFixture { +}; + +using AllTypes = cudf::test::Concat; +TYPED_TEST_CASE(ColumnViewShallowTests, AllTypes); + +// Test for fixed_width, dict, string, list, struct +// column_view, column_view = same hash. +// column_view, make a copy = same hash. +// new column_view from colmn = same hash +// column_view, copy column = diff hash +// column_view, diff column = diff hash. +// +// column_view old, update data + new column_view = same hash. +// column_view old, add null_mask + new column_view = diff hash. +// column_view old, update nulls + new column_view = same hash. +// column_view old, set_null_count + new column_view = same hash. +// +// column_view, sliced[0, size) = same hash (for split too) +// column_view, sliced[n:) = diff hash (for split too) +// column_view, bit_cast = diff hash +// +// mutable_column_view, column_view = same hash +// mutable_column_view, modified mutable_column_view = same hash +// +// update the children column data = same hash +// update the children column_views = diff hash + +TYPED_TEST(ColumnViewShallowTests, shallow_hash_basic) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // same = same hash + { + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view)); + } + // copy column_view = same hash + { + auto col_view_copy = col_view; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_copy)); + } + + // new column_view from column = same hash + { + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + + // copy column = diff hash + { + auto col_new = std::make_unique(*col); + auto col_view_copy = col_new->view(); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_copy)); + } + + // column_view, diff column = diff hash. 
+ { + auto col_diff = example_column(); + auto col_view_diff = cudf::column_view{*col_diff}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_diff)); + } +} +TYPED_TEST(ColumnViewShallowTests, shallow_hash_update_data) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // update data + new column_view = same hash. + { + // update data by modifying some bits: fixed_width, string, dict, list, struct + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + // add null_mask + new column_view = diff hash. + { + col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); + auto col_view_new = cudf::column_view{*col}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); + col_view_new.null_count(); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view_new), shallow_hash(col_view_new2)); + } + col_view = cudf::column_view{*col}; // updating after adding null_mask + // update nulls + new column_view = same hash. + { + cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false); + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + } + // set_null_count + new column_view = same hash. 
set_null_count(UNKNOWN_NULL_COUNT) + { + col->set_null_count(cudf::UNKNOWN_NULL_COUNT); + auto col_view_new = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); + col->set_null_count(col->size()); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new2)); + } +} + +TYPED_TEST(ColumnViewShallowTests, shallow_hash_slice) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // column_view, sliced[0, size) = same hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_sliced[0])); + auto col_split = cudf::split(col_view, {0}); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[0])); + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_split[1])); + } + // column_view, sliced[n:] = diff hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {1, col_view.size()}); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_sliced[0])); + auto col_split = cudf::split(col_view, {1}); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[0])); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[1])); + } + // column_view, col copy sliced[0, 0) = same hash (empty column) + { + auto col_new = std::make_unique(*col); + auto col_new_view = col_new->view(); + auto col_sliced = cudf::slice(col_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + auto col_new_sliced = cudf::slice(col_new_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + + EXPECT_EQ(shallow_hash(col_sliced[0]), shallow_hash(col_sliced[1])); + EXPECT_EQ(shallow_hash(col_sliced[1]), shallow_hash(col_sliced[2])); + EXPECT_EQ(shallow_hash(col_sliced[0]), shallow_hash(col_new_sliced[0])); + EXPECT_EQ(shallow_hash(col_sliced[1]), shallow_hash(col_new_sliced[1])); + EXPECT_EQ(shallow_hash(col_sliced[2]), shallow_hash(col_new_sliced[2])); + } + + // 
column_view, bit_cast = diff hash + { + if constexpr (std::is_integral_v and not std::is_same_v) { + using newType = std::conditional_t, + std::make_unsigned_t, + std::make_signed_t>; + auto new_type = cudf::data_type(cudf::type_to_id()); + auto col_bitcast = cudf::bit_cast(col_view, new_type); + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_bitcast)); + } + } +} + +TYPED_TEST(ColumnViewShallowTests, shallow_hash_mutable) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // mutable_column_view, column_view = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + EXPECT_EQ(shallow_hash(col_mutable), shallow_hash(col_view)); + } + // mutable_column_view, modified mutable_column_view = same hash + // update the children column data = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } + EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_mutable)); + auto col_mutable_new = cudf::mutable_column_view{*col}; + EXPECT_EQ(shallow_hash(col_mutable), shallow_hash(col_mutable_new)); + } + // update the children column_views = diff hash + { + if constexpr (cudf::is_nested()) { + col->child(0).set_null_mask( + cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL)); + auto col_child_updated = cudf::mutable_column_view{*col}; + EXPECT_NE(shallow_hash(col_view), shallow_hash(col_child_updated)); + } + } +} + +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_basic) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // same = same hash + { + EXPECT_TRUE(is_shallow_equivalent(col_view, 
col_view)); + } + // copy column_view = same hash + { + auto col_view_copy = col_view; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_copy)); + } + + // new column_view from column = same hash + { + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); + } + + // copy column = diff hash + { + auto col_new = std::make_unique(*col); + auto col_view_copy = col_new->view(); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_copy)); + } + + // column_view, diff column = diff hash. + { + auto col_diff = example_column(); + auto col_view_diff = cudf::column_view{*col_diff}; + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_diff)); + } +} +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_update_data) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // update data + new column_view = same hash. + { + // update data by modifying some bits: fixed_width, string, dict, list, struct + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 2, 64, true); + } + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); + } + // add null_mask + new column_view = diff hash. 
+ { + col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); + auto col_view_new = cudf::column_view{*col}; + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); + col_view_new.null_count(); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view_new, col_view_new2)); + } + col_view = cudf::column_view{*col}; // updating after adding null_mask + // update nulls + new column_view = same hash. + { + cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false); + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); + } + // set_null_count + new column_view = same hash. set_null_count(UNKNOWN_NULL_COUNT) + { + col->set_null_count(cudf::UNKNOWN_NULL_COUNT); + auto col_view_new = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); + col->set_null_count(col->size()); + auto col_view_new2 = cudf::column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new2)); + } +} + +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_slice) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // column_view, sliced[0, size) = same hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_sliced[0])); + auto col_split = cudf::split(col_view, {0}); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[0])); + EXPECT_TRUE(is_shallow_equivalent(col_view, col_split[1])); + } + // column_view, sliced[n:] = diff hash (for split too) + { + auto col_sliced = cudf::slice(col_view, {1, col_view.size()}); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_sliced[0])); + auto col_split = cudf::split(col_view, {1}); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[0])); + 
EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[1])); + } + // column_view, col copy sliced[0, 0) = same hash (empty column) + { + auto col_new = std::make_unique(*col); + auto col_new_view = col_new->view(); + auto col_sliced = cudf::slice(col_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + auto col_new_sliced = cudf::slice(col_new_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); + + EXPECT_TRUE(is_shallow_equivalent(col_sliced[0], col_sliced[1])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[1], col_sliced[2])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[0], col_new_sliced[0])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[1], col_new_sliced[1])); + EXPECT_TRUE(is_shallow_equivalent(col_sliced[2], col_new_sliced[2])); + } + + // column_view, bit_cast = diff hash + { + if constexpr (std::is_integral_v and not std::is_same_v) { + using newType = std::conditional_t, + std::make_unsigned_t, + std::make_signed_t>; + auto new_type = cudf::data_type(cudf::type_to_id()); + auto col_bitcast = cudf::bit_cast(col_view, new_type); + EXPECT_FALSE(is_shallow_equivalent(col_view, col_bitcast)); + } + } +} + +TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_mutable) +{ + using namespace cudf::detail; + auto col = example_column(); + auto col_view = cudf::column_view{*col}; + // mutable_column_view, column_view = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_mutable, col_view)); + } + // mutable_column_view, modified mutable_column_view = same hash + // update the children column data = same hash + { + auto col_mutable = cudf::mutable_column_view{*col}; + if constexpr (cudf::is_fixed_width()) { + // Update data + auto data = reinterpret_cast(col->mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } else { + // Update child(0).data + auto data = reinterpret_cast(col->child(0).mutable_view().head()); + cudf::set_null_mask(data, 1, 32, false); + } + 
EXPECT_TRUE(is_shallow_equivalent(col_view, col_mutable)); + auto col_mutable_new = cudf::mutable_column_view{*col}; + EXPECT_TRUE(is_shallow_equivalent(col_mutable, col_mutable_new)); + } + // update the children column_views = diff hash + { + if constexpr (cudf::is_nested()) { + col->child(0).set_null_mask( + cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL)); + auto col_child_updated = cudf::mutable_column_view{*col}; + EXPECT_FALSE(is_shallow_equivalent(col_view, col_child_updated)); + } + } +} From 8dea0b10c381f28ba9279c50cdfd18245748fc3d Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 22 Sep 2021 15:31:05 -0500 Subject: [PATCH 16/26] Optimized fsspec data transfer for remote file-systems (#9265) This PR strips the pyarrow-NativeFile component out of #9225 (since those changes are not yet stable). I feel that it is reasonable to start by merging these fsspec-specific optimizations for 21.10, because they are stable and already result in a significant performance boost over the existing approach to remote storage. I still think it is very important that we eventually plumb NativeFile support into python (cudf and dask_cudf), but we will likely need to target 21.12 for that improvement. 
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Ashwin Srinath (https://github.com/shwina) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/9265 --- python/cudf/cudf/io/csv.py | 10 +- python/cudf/cudf/io/parquet.py | 216 ++++++++++++++++++---- python/cudf/cudf/tests/test_gcs.py | 6 +- python/cudf/cudf/tests/test_s3.py | 56 +++++- python/cudf/cudf/utils/ioutils.py | 219 +++++++++++++++++++++-- python/dask_cudf/dask_cudf/io/parquet.py | 158 ++++++++-------- 6 files changed, 524 insertions(+), 141 deletions(-) diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 61f3457087c..966ede655c6 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -61,9 +61,17 @@ def read_csv( path_or_data=filepath_or_buffer, compression=compression, iotypes=(BytesIO, StringIO), + byte_ranges=[byte_range] if byte_range else None, + clip_local_buffer=True if byte_range else False, **kwargs, ) + # Adjust byte_range for clipped local buffers + use_byte_range = byte_range + if byte_range and isinstance(filepath_or_buffer, BytesIO): + if byte_range[1] == filepath_or_buffer.getbuffer().nbytes: + use_byte_range = (0, byte_range[1]) + if na_values is not None and is_scalar(na_values): na_values = [na_values] @@ -91,7 +99,7 @@ def read_csv( true_values=true_values, false_values=false_values, nrows=nrows, - byte_range=byte_range, + byte_range=use_byte_range, skip_blank_lines=skip_blank_lines, parse_dates=parse_dates, comment=comment, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index a0713bbce2e..56cfd563435 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,9 +1,11 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+import io import warnings from collections import defaultdict from uuid import uuid4 +import fsspec from pyarrow import dataset as ds, parquet as pq import cudf @@ -160,6 +162,129 @@ def read_parquet_metadata(path): return num_rows, num_row_groups, col_names +def _process_row_groups(paths, fs, filters=None, row_groups=None): + + # The general purpose of this function is to (1) expand + # directory input into a list of paths (using the pyarrow + # dataset API), and (2) to apply row-group filters. + + # Deal with case that the user passed in a directory name + file_list = paths + if len(paths) == 1 and ioutils.is_directory(paths[0]): + paths = ioutils.stringify_pathlike(paths[0]) + + # Convert filters to ds.Expression + if filters is not None: + filters = pq._filters_to_expression(filters) + + # Initialize ds.FilesystemDataset + dataset = ds.dataset( + paths, filesystem=fs, format="parquet", partitioning="hive", + ) + file_list = dataset.files + if len(file_list) == 0: + raise FileNotFoundError(f"{paths} could not be resolved to any files") + + if filters is not None: + # Load IDs of filtered row groups for each file in dataset + filtered_rg_ids = defaultdict(list) + for fragment in dataset.get_fragments(filter=filters): + for rg_fragment in fragment.split_by_row_group(filters): + for rg_info in rg_fragment.row_groups: + filtered_rg_ids[rg_fragment.path].append(rg_info.id) + + # Initialize row_groups to be selected + if row_groups is None: + row_groups = [None for _ in dataset.files] + + # Store IDs of selected row groups for each file + for i, file in enumerate(dataset.files): + if row_groups[i] is None: + row_groups[i] = filtered_rg_ids[file] + else: + row_groups[i] = filter( + lambda id: id in row_groups[i], filtered_rg_ids[file] + ) + + return file_list, row_groups + + +def _get_byte_ranges(file_list, row_groups, columns, fs): + + # This utility is used to collect the footer metadata + # from a parquet file. 
This metadata is used to define + # the exact byte-ranges that will be needed to read the + # target column-chunks from the file. + # + # This utility is only used for remote storage. + # + # The calculated byte-range information is used within + # cudf.io.ioutils.get_filepath_or_buffer (which uses + # _fsspec_data_transfer to convert non-local fsspec file + # objects into local byte buffers). + + if row_groups is None: + if columns is None: + return None, None, None # No reason to construct this + row_groups = [None for path in file_list] + + # Construct a list of required byte-ranges for every file + all_byte_ranges, all_footers, all_sizes = [], [], [] + for path, rgs in zip(file_list, row_groups): + + # Step 0 - Get size of file + if fs is None: + file_size = path.size + else: + file_size = fs.size(path) + + # Step 1 - Get 32 KB from tail of file. + # + # This "sample size" can be tunable, but should + # always be >= 8 bytes (so we can read the footer size) + tail_size = min(32_000, file_size) + if fs is None: + path.seek(file_size - tail_size) + footer_sample = path.read(tail_size) + else: + footer_sample = fs.tail(path, tail_size) + + # Step 2 - Read the footer size and re-read a larger + # tail if necessary + footer_size = int.from_bytes(footer_sample[-8:-4], "little") + if tail_size < (footer_size + 8): + if fs is None: + path.seek(file_size - (footer_size + 8)) + footer_sample = path.read(footer_size + 8) + else: + footer_sample = fs.tail(path, footer_size + 8) + + # Step 3 - Collect required byte ranges + byte_ranges = [] + md = pq.ParquetFile(io.BytesIO(footer_sample)).metadata + for r in range(md.num_row_groups): + # Skip this row-group if we are targetting + # specific row-groups + if rgs is None or r in rgs: + row_group = md.row_group(r) + for c in range(row_group.num_columns): + column = row_group.column(c) + name = column.path_in_schema + # Skip this column if we are targetting a + # specific columns + if columns is None or name in columns: + 
file_offset0 = column.dictionary_page_offset + if file_offset0 is None: + file_offset0 = column.data_page_offset + num_bytes = column.total_uncompressed_size + byte_ranges.append((file_offset0, num_bytes)) + + all_byte_ranges.append(byte_ranges) + all_footers.append(footer_sample) + all_sizes.append(file_size) + return all_byte_ranges, all_footers, all_sizes + + @ioutils.doc_read_parquet() def read_parquet( filepath_or_buffer, @@ -189,18 +314,66 @@ def read_parquet( elif not is_list_like(row_groups[0]): row_groups = [row_groups] + # Check columns input + if columns is not None: + if not is_list_like(columns): + raise ValueError("Expected list like for columns") + + # Start by trying construct a filesystem object, so we + # can apply filters on remote file-systems + fs, paths = ioutils._get_filesystem_and_paths(filepath_or_buffer, **kwargs) + filepath_or_buffer = paths if paths else filepath_or_buffer + if fs is None and filters is not None: + raise ValueError("cudf cannot apply filters to open file objects.") + + # Apply filters now (before converting non-local paths to buffers). + # Note that `_process_row_groups` will also expand `filepath_or_buffer` + # into a full list of files if it is a directory. + if fs is not None: + filepath_or_buffer, row_groups = _process_row_groups( + filepath_or_buffer, fs, filters=filters, row_groups=row_groups, + ) + + # Check if we should calculate the specific byte-ranges + # needed for each parquet file. We always do this when we + # have a file-system object to work with and it is not a + # local filesystem object. 
We can also do it without a + # file-system object for `AbstractBufferedFile` buffers + byte_ranges, footers, file_sizes = None, None, None + need_byte_ranges = fs is not None and not ioutils._is_local_filesystem(fs) + if need_byte_ranges or ( + filepath_or_buffer + and isinstance( + filepath_or_buffer[0], fsspec.spec.AbstractBufferedFile, + ) + ): + byte_ranges, footers, file_sizes = _get_byte_ranges( + filepath_or_buffer, row_groups, columns, fs, + ) + filepaths_or_buffers = [] - for source in filepath_or_buffer: + for i, source in enumerate(filepath_or_buffer): + if ioutils.is_directory(source, **kwargs): - fs = ioutils._ensure_filesystem( - passed_filesystem=None, path=source + # Note: For now, we know `fs` is an fsspec filesystem + # object, but it may be an arrow object in the future + fsspec_fs = ioutils._ensure_filesystem( + passed_filesystem=fs, path=source ) source = ioutils.stringify_pathlike(source) - source = fs.sep.join([source, "*.parquet"]) + source = fsspec_fs.sep.join([source, "*.parquet"]) tmp_source, compression = ioutils.get_filepath_or_buffer( - path_or_data=source, compression=None, **kwargs, + path_or_data=source, + compression=None, + fs=fs, + byte_ranges=byte_ranges[i] if byte_ranges else None, + footer=footers[i] if footers else None, + file_size=file_sizes[i] if file_sizes else None, + add_par1_magic=True, + **kwargs, ) + if compression is not None: raise ValueError( "URL content-encoding decompression is not supported" @@ -210,39 +383,6 @@ def read_parquet( else: filepaths_or_buffers.append(tmp_source) - if columns is not None: - if not is_list_like(columns): - raise ValueError("Expected list like for columns") - - if filters is not None: - # Convert filters to ds.Expression - filters = pq._filters_to_expression(filters) - - # Initialize ds.FilesystemDataset - dataset = ds.dataset( - filepaths_or_buffers, format="parquet", partitioning="hive" - ) - - # Load IDs of filtered row groups for each file in dataset - filtered_rg_ids = 
defaultdict(list) - for fragment in dataset.get_fragments(filter=filters): - for rg_fragment in fragment.split_by_row_group(filters): - for rg_info in rg_fragment.row_groups: - filtered_rg_ids[rg_fragment.path].append(rg_info.id) - - # Initialize row_groups to be selected - if row_groups is None: - row_groups = [None for _ in dataset.files] - - # Store IDs of selected row groups for each file - for i, file in enumerate(dataset.files): - if row_groups[i] is None: - row_groups[i] = filtered_rg_ids[file] - else: - row_groups[i] = filter( - lambda id: id in row_groups[i], filtered_rg_ids[file] - ) - if engine == "cudf": return libparquet.read_parquet( filepaths_or_buffers, diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index 99d79e41520..03cd6c6f5cb 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -34,10 +34,14 @@ def test_read_csv(pdf, monkeypatch): fpath = TEST_BUCKET + "test_csv_reader.csv" buffer = pdf.to_csv(index=False) - def mock_open(*args): + def mock_open(*args, **kwargs): return io.BytesIO(buffer.encode()) + def mock_size(*args): + return len(buffer.encode()) + monkeypatch.setattr(gcsfs.core.GCSFileSystem, "open", mock_open) + monkeypatch.setattr(gcsfs.core.GCSFileSystem, "size", mock_size) got = cudf.read_csv("gcs://{}".format(fpath)) assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index 133597b8f19..11ed68056b6 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -122,19 +122,41 @@ def pdf(scope="module"): return df -def test_read_csv(s3_base, s3so, pdf): +@pytest.mark.parametrize("bytes_per_thread", [32, 1024]) +def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): # Write to buffer fname = "test_csv_reader.csv" bname = "csv" buffer = pdf.to_csv(index=False) with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): got = cudf.read_csv( - "s3://{}/{}".format(bname, 
fname), storage_options=s3so + "s3://{}/{}".format(bname, fname), + storage_options=s3so, + bytes_per_thread=bytes_per_thread, ) assert_eq(pdf, got) +@pytest.mark.parametrize("bytes_per_thread", [32, 1024]) +def test_read_csv_byte_range(s3_base, s3so, pdf, bytes_per_thread): + # Write to buffer + fname = "test_csv_reader_byte_range.csv" + bname = "csv" + buffer = pdf.to_csv(index=False) + with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + got = cudf.read_csv( + "s3://{}/{}".format(bname, fname), + storage_options=s3so, + byte_range=(74, 73), + bytes_per_thread=bytes_per_thread, + header=False, + names=["Integer", "Float", "Integer2", "String", "Boolean"], + ) + + assert_eq(pdf.iloc[-2:].reset_index(drop=True), got) + + @pytest.mark.parametrize("chunksize", [None, 3]) def test_write_csv(s3_base, s3so, pdf, chunksize): # Write to buffer @@ -156,7 +178,9 @@ def test_write_csv(s3_base, s3so, pdf, chunksize): assert_eq(pdf, got) -def test_read_parquet(s3_base, s3so, pdf): +@pytest.mark.parametrize("bytes_per_thread", [32, 1024]) +@pytest.mark.parametrize("columns", [None, ["Float", "String"]]) +def test_read_parquet(s3_base, s3so, pdf, bytes_per_thread, columns): fname = "test_parquet_reader.parquet" bname = "parquet" buffer = BytesIO() @@ -164,10 +188,32 @@ def test_read_parquet(s3_base, s3so, pdf): buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): got = cudf.read_parquet( - "s3://{}/{}".format(bname, fname), storage_options=s3so + "s3://{}/{}".format(bname, fname), + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + columns=columns, ) - assert_eq(pdf, got) + expect = pdf[columns] if columns else pdf + assert_eq(expect, got) + + +def test_read_parquet_filters(s3_base, s3so, pdf): + fname = "test_parquet_reader_filters.parquet" + bname = "parquet" + buffer = BytesIO() + pdf.to_parquet(path=buffer) + buffer.seek(0) + filters = [("String", "==", "Omega")] + with s3_context(s3_base=s3_base, bucket=bname, 
files={fname: buffer}): + got = cudf.read_parquet( + "s3://{}/{}".format(bname, fname), + storage_options=s3so, + filters=filters, + ) + + # All row-groups should be filtered out + assert_eq(pdf.iloc[:0], got.reset_index(drop=True)) def test_write_parquet(s3_base, s3so, pdf): diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 15cf50af817..4bffd06c4cc 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -4,9 +4,11 @@ import os import urllib from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper +from threading import Thread import fsspec import fsspec.implementations.local +import numpy as np import pandas as pd from fsspec.core import get_fs_token_paths @@ -1129,8 +1131,51 @@ def is_directory(path_or_data, **kwargs): return False +def _get_filesystem_and_paths(path_or_data, **kwargs): + # Returns a filesystem object and the filesystem-normalized + # paths. If `path_or_data` does not correspond to a path or + # list of paths (or if the protocol is not supported), the + # return will be `None` for the fs and `[]` for the paths. + + fs = None + return_paths = path_or_data + if isinstance(path_or_data, str) or ( + isinstance(path_or_data, list) + and isinstance(stringify_pathlike(path_or_data[0]), str) + ): + # Ensure we are always working with a list + storage_options = kwargs.get("storage_options") + if isinstance(path_or_data, list): + path_or_data = [ + os.path.expanduser(stringify_pathlike(source)) + for source in path_or_data + ] + else: + path_or_data = [path_or_data] + + # Pyarrow did not support the protocol or storage options. 
+ # Fall back to fsspec + try: + fs, _, fs_paths = fsspec.get_fs_token_paths( + path_or_data, mode="rb", storage_options=storage_options + ) + return_paths = fs_paths + except ValueError as e: + if str(e).startswith("Protocol not known"): + return None, [] + else: + raise e + + return fs, return_paths + + def get_filepath_or_buffer( - path_or_data, compression, mode="rb", iotypes=(BytesIO), **kwargs, + path_or_data, + compression, + mode="rb", + fs=None, + iotypes=(BytesIO,), + **kwargs, ): """Return either a filepath string to data, or a memory buffer of data. If filepath, then the source filepath is expanded to user's environment. @@ -1158,19 +1203,13 @@ def get_filepath_or_buffer( path_or_data = stringify_pathlike(path_or_data) if isinstance(path_or_data, str): - storage_options = kwargs.get("storage_options") - # fsspec does not expanduser so handle here - path_or_data = os.path.expanduser(path_or_data) - try: - fs, _, paths = fsspec.get_fs_token_paths( - path_or_data, mode=mode, storage_options=storage_options - ) - except ValueError as e: - if str(e).startswith("Protocol not known"): + # Get a filesystem object if one isn't already available + paths = [path_or_data] + if fs is None: + fs, paths = _get_filesystem_and_paths(path_or_data, **kwargs) + if fs is None: return path_or_data, compression - else: - raise e if len(paths) == 0: raise FileNotFoundError( @@ -1184,14 +1223,21 @@ def get_filepath_or_buffer( path_or_data = paths if len(paths) > 1 else paths[0] else: - path_or_data = [BytesIO(fs.open(fpath).read()) for fpath in paths] + path_or_data = [ + BytesIO( + _fsspec_data_transfer(fpath, fs=fs, mode=mode, **kwargs) + ) + for fpath in paths + ] if len(path_or_data) == 1: path_or_data = path_or_data[0] elif not isinstance(path_or_data, iotypes) and is_file_like(path_or_data): if isinstance(path_or_data, TextIOWrapper): path_or_data = path_or_data.buffer - path_or_data = BytesIO(path_or_data.read()) + path_or_data = BytesIO( + 
_fsspec_data_transfer(path_or_data, mode=mode, **kwargs) + ) return path_or_data, compression @@ -1413,3 +1459,148 @@ def _ensure_filesystem(passed_filesystem, path): 0 ] return passed_filesystem + + +# +# Fsspec Data-transfer Optimization Code +# + + +def _fsspec_data_transfer( + path_or_fob, + fs=None, + byte_ranges=None, + footer=None, + file_size=None, + add_par1_magic=None, + bytes_per_thread=256_000_000, + max_gap=64_000, + mode="rb", + clip_local_buffer=False, + **kwargs, +): + + # Require `fs` if `path_or_fob` is not file-like + file_like = is_file_like(path_or_fob) + if fs is None and not file_like: + raise ValueError( + "fs must be defined if `path_or_fob` is not file-like" + ) + + # Calculate total file size + if file_like: + file_size = path_or_fob.size + file_size = file_size or fs.size(path_or_fob) + + # Check if a direct read makes the most sense + if not byte_ranges and bytes_per_thread >= file_size: + if file_like: + return path_or_fob.read() + else: + return fs.open(path_or_fob, mode=mode, cache_type="none").read() + + # Threaded read into "local" buffer + buf = np.zeros(file_size, dtype="b") + if byte_ranges: + + # Optimize/merge the ranges + byte_ranges = _merge_ranges( + byte_ranges, max_block=bytes_per_thread, max_gap=max_gap, + ) + + # Call multi-threaded data transfer of + # remote byte-ranges to local buffer + _read_byte_ranges( + path_or_fob, byte_ranges, buf, fs=fs, **kwargs, + ) + + # Add Header & Footer bytes + if footer is not None: + footer_size = len(footer) + buf[-footer_size:] = np.frombuffer( + footer[-footer_size:], dtype="b" + ) + + # Add parquet magic bytes (optional) + if add_par1_magic: + buf[:4] = np.frombuffer(b"PAR1", dtype="b") + if footer is None: + buf[-4:] = np.frombuffer(b"PAR1", dtype="b") + + else: + byte_ranges = [ + (b, min(bytes_per_thread, file_size - b)) + for b in range(0, file_size, bytes_per_thread) + ] + _read_byte_ranges( + path_or_fob, byte_ranges, buf, fs=fs, **kwargs, + ) + + if clip_local_buffer: + # 
If we only need the populated byte range + # (e.g. a csv byte-range read) then clip parts + # of the local buffer that are outside this range + start = byte_ranges[0][0] + end = byte_ranges[-1][0] + byte_ranges[-1][1] + return buf[start:end].tobytes() + + return buf.tobytes() + + +def _merge_ranges(byte_ranges, max_block=256_000_000, max_gap=64_000): + # Simple utility to merge small/adjacent byte ranges + new_ranges = [] + if not byte_ranges: + # Early return + return new_ranges + + offset, size = byte_ranges[0] + for (new_offset, new_size) in byte_ranges[1:]: + gap = new_offset - (offset + size) + if gap > max_gap or (size + new_size + gap) > max_block: + # Gap is too large or total read is too large + new_ranges.append((offset, size)) + offset = new_offset + size = new_size + continue + size += new_size + gap + new_ranges.append((offset, size)) + return new_ranges + + +def _assign_block(fs, path_or_fob, local_buffer, offset, nbytes): + if fs is None: + # We have an open fsspec file object + path_or_fob.seek(offset) + local_buffer[offset : offset + nbytes] = np.frombuffer( + path_or_fob.read(nbytes), dtype="b", + ) + else: + # We have an fsspec filesystem and a path + with fs.open(path_or_fob, mode="rb", cache_type="none") as fob: + fob.seek(offset) + local_buffer[offset : offset + nbytes] = np.frombuffer( + fob.read(nbytes), dtype="b", + ) + + +def _read_byte_ranges( + path_or_fob, ranges, local_buffer, fs=None, **kwargs, +): + # Simple utility to copy remote byte ranges + # into a local buffer for IO in libcudf + workers = [] + for (offset, nbytes) in ranges: + if len(ranges) > 1: + workers.append( + Thread( + target=_assign_block, + args=(fs, path_or_fob, local_buffer, offset, nbytes), + ) + ) + workers[-1].start() + else: + _assign_block(fs, path_or_fob, local_buffer, offset, nbytes) + + for worker in workers: + worker.join() diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 5028dce2b4e..850cc0843cc 100644 
--- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,5 +1,6 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION. import warnings +from contextlib import ExitStack from functools import partial from io import BufferedWriter, BytesIO, IOBase @@ -67,15 +68,33 @@ def _read_paths( **kwargs, ): - # Use cudf to read in data - df = cudf.read_parquet( - paths, - engine="cudf", - columns=columns, - row_groups=row_groups if row_groups else None, - strings_to_categorical=strings_to_categorical, - **kwargs, - ) + # Simplify row_groups if all None + if row_groups == [None for path in paths]: + row_groups = None + + with ExitStack() as stack: + + # Non-local filesystem handling + paths_or_fobs = paths + if not cudf.utils.ioutils._is_local_filesystem(fs): + + # Convert paths to file objects for remote data + paths_or_fobs = [ + stack.enter_context( + fs.open(path, mode="rb", cache_type="none") + ) + for path in paths + ] + + # Use cudf to read in data + df = cudf.read_parquet( + paths_or_fobs, + engine="cudf", + columns=columns, + row_groups=row_groups if row_groups else None, + strings_to_categorical=strings_to_categorical, + **kwargs, + ) if partitions and partition_keys is None: @@ -134,101 +153,76 @@ def read_partition( categories=(), partitions=(), partitioning=None, + schema=None, **kwargs, ): + if columns is not None: columns = [c for c in columns] if isinstance(index, list): columns += index + # Check if we are actually selecting any columns + read_columns = columns + if schema and columns: + ignored = set(schema.names) - set(columns) + if not ignored: + read_columns = None + if not isinstance(pieces, list): pieces = [pieces] strings_to_cats = kwargs.get("strings_to_categorical", False) - if len(pieces) > 1: - - # Multi-peice read - paths = [] - rgs = [] - last_partition_keys = None - dfs = [] - - for i, piece in enumerate(pieces): - - (path, row_group, partition_keys) = piece - row_group = None if row_group == [None] else 
row_group - - if i > 0 and partition_keys != last_partition_keys: - dfs.append( - cls._read_paths( - paths, - fs, - columns=columns, - row_groups=rgs if rgs else None, - strings_to_categorical=strings_to_cats, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - **kwargs.get("read", {}), - ) - ) - paths = rgs = [] - last_partition_keys = None - paths.append(path) - rgs.append( - [row_group] - if not isinstance(row_group, list) - else row_group - ) - last_partition_keys = partition_keys - - dfs.append( - cls._read_paths( - paths, - fs, - columns=columns, - row_groups=rgs if rgs else None, - strings_to_categorical=strings_to_cats, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - **kwargs.get("read", {}), - ) - ) - df = cudf.concat(dfs) + # Assume multi-piece read + paths = [] + rgs = [] + last_partition_keys = None + dfs = [] - else: + for i, piece in enumerate(pieces): - # Single-piece read - (path, row_group, partition_keys) = pieces[0] + (path, row_group, partition_keys) = piece row_group = None if row_group == [None] else row_group - if cudf.utils.ioutils._is_local_filesystem(fs): - df = cls._read_paths( - path, - fs, - columns=columns, - row_groups=row_group, - strings_to_categorical=strings_to_cats, - partitions=partitions, - partitioning=partitioning, - partition_keys=partition_keys, - **kwargs.get("read", {}), - ) - else: - with fs.open(path, mode="rb") as f: - df = cls._read_paths( - f, + if i > 0 and partition_keys != last_partition_keys: + dfs.append( + cls._read_paths( + paths, fs, - columns=columns, - row_groups=row_group, + columns=read_columns, + row_groups=rgs if rgs else None, strings_to_categorical=strings_to_cats, partitions=partitions, partitioning=partitioning, - partition_keys=partition_keys, + partition_keys=last_partition_keys, **kwargs.get("read", {}), ) + ) + paths = rgs = [] + last_partition_keys = None + paths.append(path) + rgs.append( + [row_group] + if
not isinstance(row_group, list) and row_group is not None + else row_group + ) + last_partition_keys = partition_keys + + dfs.append( + cls._read_paths( + paths, + fs, + columns=read_columns, + row_groups=rgs if rgs else None, + strings_to_categorical=strings_to_cats, + partitions=partitions, + partitioning=partitioning, + partition_keys=last_partition_keys, + **kwargs.get("read", {}), + ) + ) + df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0] # Re-set "object" dtypes align with pa schema set_object_dtypes_from_pa_schema(df, kwargs.get("schema", None)) From 2c6b39bdd7a37e0aa8708ed4018d1ad360a4d104 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 22 Sep 2021 15:31:43 -0700 Subject: [PATCH 17/26] Add support for struct type in ORC writer (#9025) Fixes #7830, #8443 Features: - Use the new table metadata type that matches the table hierarchy, `table_input_metadata`. - Support struct columns in the writer. Changes: - Null masks are encoded as aligned rowgroups to avoid invalid bits when the number of encoded rows is not divisible by 8 (except for the last rowgroup in each stripe). This also affects list columns. The issue is equivalent to https://github.com/rapidsai/cudf/issues/6763 (boolean columns only). - Added pushdown masks that are used to determine which child elements should not be encoded, including null mask bits. - Use pushdown masks for rowgroup alignment, null mask encoding and value encoding. - Separated the null mask encoding from value encoding - can be further moved to a separate kernel call. Breaking because the table metadata type has changed. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) - Jason Lowe (https://github.com/jlowe) Approvers: - Robert Maynard (https://github.com/robertmaynard) - AJ Schmidt (https://github.com/ajschmidt8) - Robert (Bobby) Evans (https://github.com/revans2) - Vyas Ramasubramani (https://github.com/vyasr) - Devavret Makkar (https://github.com/devavret) - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) URL: https://github.com/rapidsai/cudf/pull/9025 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + cpp/include/cudf/io/orc.hpp | 19 +- cpp/include/cudf/io/parquet.hpp | 169 ------ cpp/include/cudf/io/types.hpp | 199 ++++++- cpp/include/cudf/utilities/bit.hpp | 18 + .../cudf_test/io_metadata_utilities.hpp | 25 + cpp/src/io/orc/dict_enc.cu | 2 +- cpp/src/io/orc/orc.h | 8 +- cpp/src/io/orc/orc_gpu.h | 33 +- cpp/src/io/orc/stripe_enc.cu | 267 ++++----- cpp/src/io/orc/stripe_init.cu | 64 ++- cpp/src/io/orc/writer_impl.cu | 510 ++++++++++++++---- cpp/src/io/orc/writer_impl.hpp | 24 +- cpp/tests/io/metadata_utilities.cpp | 42 ++ cpp/tests/io/orc_test.cpp | 366 ++++++------- cpp/tests/io/parquet_test.cpp | 52 +- .../java/ai/rapids/cudf/ORCWriterOptions.java | 4 +- java/src/main/java/ai/rapids/cudf/Table.java | 7 +- java/src/main/native/src/TableJni.cpp | 57 +- .../test/java/ai/rapids/cudf/TableTest.java | 13 +- python/cudf/cudf/_lib/cpp/io/orc.pxd | 12 +- python/cudf/cudf/_lib/cpp/io/parquet.pxd | 31 +- python/cudf/cudf/_lib/cpp/io/types.pxd | 27 +- python/cudf/cudf/_lib/orc.pyx | 86 ++- python/cudf/cudf/_lib/parquet.pyx | 3 +- python/cudf/cudf/io/orc.py | 8 +- python/cudf/cudf/tests/test_orc.py | 56 +- python/cudf/cudf/utils/ioutils.py | 6 + 29 files changed, 1313 insertions(+), 797 deletions(-) create mode 100644 cpp/include/cudf_test/io_metadata_utilities.hpp create mode 100644 cpp/tests/io/metadata_utilities.cpp diff --git a/conda/recipes/libcudf/meta.yaml 
b/conda/recipes/libcudf/meta.yaml index 0f05dcb4bb3..c3450fe8d88 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -238,6 +238,7 @@ test: - test -f $PREFIX/include/cudf_test/cudf_gtest.hpp - test -f $PREFIX/include/cudf_test/cxxopts.hpp - test -f $PREFIX/include/cudf_test/file_utilities.hpp + - test -f $PREFIX/include/cudf_test/io_metadata_utilities.hpp - test -f $PREFIX/include/cudf_test/iterator_utilities.hpp - test -f $PREFIX/include/cudf_test/table_utilities.hpp - test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c72c258fd18..2df35aa0971 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -565,6 +565,7 @@ add_library(cudftestutil STATIC tests/utilities/base_fixture.cpp tests/utilities/column_utilities.cu tests/utilities/table_utilities.cu + tests/io/metadata_utilities.cpp tests/strings/utilities.cu) set_target_properties(cudftestutil diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 4ae09b516a4..17d8e5eb7dd 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -389,7 +389,7 @@ class orc_writer_options { // Set of columns to output table_view _table; // Optional associated metadata - const table_metadata* _metadata = nullptr; + const table_input_metadata* _metadata = nullptr; friend orc_writer_options_builder; @@ -445,7 +445,7 @@ class orc_writer_options { /** * @brief Returns associated metadata. */ - table_metadata const* get_metadata() const { return _metadata; } + table_input_metadata const* get_metadata() const { return _metadata; } // Setters @@ -475,7 +475,7 @@ class orc_writer_options { * * @param meta Associated metadata. */ - void set_metadata(table_metadata* meta) { _metadata = meta; } + void set_metadata(table_input_metadata const* meta) { _metadata = meta; } }; class orc_writer_options_builder { @@ -541,7 +541,7 @@ class orc_writer_options_builder { * @param meta Associated metadata. 
* @return this for chaining. */ - orc_writer_options_builder& metadata(table_metadata* meta) + orc_writer_options_builder& metadata(table_input_metadata const* meta) { options._metadata = meta; return *this; @@ -570,6 +570,9 @@ class orc_writer_options_builder { * cudf::io::write_orc(options); * @endcode * + * Note: Support for writing tables with struct columns is currently experimental, the output may + * not be as reliable as writing for other datatypes. + * * @param options Settings for controlling reading behavior. * @param mr Device memory resource to use for device memory allocation. */ @@ -592,7 +595,7 @@ class chunked_orc_writer_options { // Enable writing column statistics bool _enable_statistics = true; // Optional associated metadata - const table_metadata_with_nullability* _metadata = nullptr; + const table_input_metadata* _metadata = nullptr; friend chunked_orc_writer_options_builder; @@ -638,7 +641,7 @@ class chunked_orc_writer_options { /** * @brief Returns associated metadata. */ - table_metadata_with_nullability const* get_metadata() const { return _metadata; } + table_input_metadata const* get_metadata() const { return _metadata; } // Setters @@ -661,7 +664,7 @@ class chunked_orc_writer_options { * * @param meta Associated metadata. */ - void metadata(table_metadata_with_nullability* meta) { _metadata = meta; } + void metadata(table_input_metadata const* meta) { _metadata = meta; } }; class chunked_orc_writer_options_builder { @@ -712,7 +715,7 @@ class chunked_orc_writer_options_builder { * @param meta Associated metadata. * @return this for chaining. 
*/ - chunked_orc_writer_options_builder& metadata(table_metadata_with_nullability* meta) + chunked_orc_writer_options_builder& metadata(table_input_metadata const* meta) { options._metadata = meta; return *this; diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 25cbb6fd554..bc495c61d54 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -24,8 +24,6 @@ #include -#include - #include #include #include @@ -375,173 +373,6 @@ table_with_metadata read_parquet( * @{ * @file */ -class table_input_metadata; - -class column_in_metadata { - friend table_input_metadata; - std::string _name = ""; - thrust::optional _nullable; - // TODO: This isn't implemented yet - bool _list_column_is_map = false; - bool _use_int96_timestamp = false; - // bool _output_as_binary = false; - thrust::optional _decimal_precision; - std::vector children; - - public: - /** - * @brief Get the children of this column metadata - * - * @return this for chaining - */ - column_in_metadata& add_child(column_in_metadata const& child) - { - children.push_back(child); - return *this; - } - - /** - * @brief Set the name of this column - * - * @return this for chaining - */ - column_in_metadata& set_name(std::string const& name) - { - _name = name; - return *this; - } - - /** - * @brief Set the nullability of this column - * - * Only valid in case of chunked writes. In single writes, this option is ignored. - * - * @return column_in_metadata& - */ - column_in_metadata& set_nullability(bool nullable) - { - _nullable = nullable; - return *this; - } - - /** - * @brief Specify that this list column should be encoded as a map in the written parquet file - * - * The column must have the structure list>. 
This option is invalid otherwise - * - * @return this for chaining - */ - column_in_metadata& set_list_column_as_map() - { - _list_column_is_map = true; - return *this; - } - - /** - * @brief Specifies whether this timestamp column should be encoded using the deprecated int96 - * physical type. Only valid for the following column types: - * timestamp_s, timestamp_ms, timestamp_us, timestamp_ns - * - * @param req True = use int96 physical type. False = use int64 physical type - * @return this for chaining - */ - column_in_metadata& set_int96_timestamps(bool req) - { - _use_int96_timestamp = req; - return *this; - } - - /** - * @brief Set the decimal precision of this column. Only valid if this column is a decimal - * (fixed-point) type - * - * @param precision The integer precision to set for this decimal column - * @return this for chaining - */ - column_in_metadata& set_decimal_precision(uint8_t precision) - { - _decimal_precision = precision; - return *this; - } - - /** - * @brief Get reference to a child of this column - * - * @param i Index of the child to get - * @return this for chaining - */ - column_in_metadata& child(size_type i) { return children[i]; } - - /** - * @brief Get const reference to a child of this column - * - * @param i Index of the child to get - * @return this for chaining - */ - column_in_metadata const& child(size_type i) const { return children[i]; } - - /** - * @brief Get the name of this column - */ - std::string get_name() const { return _name; } - - /** - * @brief Get whether nullability has been explicitly set for this column. - */ - bool is_nullability_defined() const { return _nullable.has_value(); } - - /** - * @brief Gets the explicitly set nullability for this column. - * @throws If nullability is not explicitly defined for this column. - * Check using `is_nullability_defined()` first. 
- */ - bool nullable() const { return _nullable.value(); } - - /** - * @brief If this is the metadata of a list column, returns whether it is to be encoded as a map. - */ - bool is_map() const { return _list_column_is_map; } - - /** - * @brief Get whether to encode this timestamp column using deprecated int96 physical type - */ - bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } - - /** - * @brief Get whether precision has been set for this decimal column - */ - bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } - - /** - * @brief Get the decimal precision that was set for this column. - * @throws If decimal precision was not set for this column. - * Check using `is_decimal_precision_set()` first. - */ - uint8_t get_decimal_precision() const { return _decimal_precision.value(); } - - /** - * @brief Get the number of children of this column - */ - size_type num_children() const { return children.size(); } -}; - -class table_input_metadata { - public: - table_input_metadata() = default; // Required by cython - - /** - * @brief Construct a new table_input_metadata from a table_view. - * - * The constructed table_input_metadata has the same structure as the passed table_view - * - * @param table The table_view to construct metadata for - * @param user_data Optional Additional metadata to encode, as key-value pairs - */ - table_input_metadata(table_view const& table, std::map user_data = {}); - - std::vector column_metadata; - std::map user_data; //!< Format-dependent metadata as key-values pairs -}; /** * @brief Class to build `parquet_writer_options`. 
diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 661b36f10c8..ac965e2d416 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -23,6 +23,8 @@ #include +#include + #include #include #include @@ -125,34 +127,6 @@ struct table_metadata { std::map user_data; //!< Format-dependent metadata as key-values pairs }; -/** - * @brief Derived class of table_metadata which includes flattened nullability information of input. - * - * This information is used as an optimization for chunked writes. If the caller leaves - * column_nullable uninitialized, the writer code will assume the worst case : that all columns are - * nullable. - * - * If the column_nullable field is not empty, it is expected that it has a length equal to the - * number of columns in the flattened table being written. - * - * Flattening refers to the flattening of nested columns. For list columns, the number of values - * expected in the nullability vector is equal to the depth of the nesting. e.g. for a table of - * three columns of types: {int, list, float}, the nullability vector contains the values: - * - * |Index| Nullability of | - * |-----|----------------------------------------| - * | 0 | int column | - * | 1 | Level 0 of list column (list itself) | - * | 2 | Level 1 of list column (double values) | - * | 3 | float column | - * - * In the case where column nullability is known, pass `true` if the corresponding column could - * contain nulls in one or more subtables to be written, otherwise `false`. - */ -struct table_metadata_with_nullability : public table_metadata { - std::vector column_nullable; //!< Per-column nullability information. 
-}; - /** * @brief Table with table metadata used by io readers to return the metadata by value */ @@ -234,5 +208,174 @@ struct sink_info { } }; +class table_input_metadata; + +class column_in_metadata { + friend table_input_metadata; + std::string _name = ""; + thrust::optional _nullable; + bool _list_column_is_map = false; + bool _use_int96_timestamp = false; + // bool _output_as_binary = false; + thrust::optional _decimal_precision; + std::vector children; + + public: + column_in_metadata() = default; + column_in_metadata(std::string_view name) : _name{name} {} + /** + * @brief Get the children of this column metadata + * + * @return this for chaining + */ + column_in_metadata& add_child(column_in_metadata const& child) + { + children.push_back(child); + return *this; + } + + /** + * @brief Set the name of this column + * + * @return this for chaining + */ + column_in_metadata& set_name(std::string const& name) + { + _name = name; + return *this; + } + + /** + * @brief Set the nullability of this column + * + * Only valid in case of chunked writes. In single writes, this option is ignored. + * + * @return column_in_metadata& + */ + column_in_metadata& set_nullability(bool nullable) + { + _nullable = nullable; + return *this; + } + + /** + * @brief Specify that this list column should be encoded as a map in the written parquet file + * + * The column must have the structure list>. This option is invalid otherwise + * + * @return this for chaining + */ + column_in_metadata& set_list_column_as_map() + { + _list_column_is_map = true; + return *this; + } + + /** + * @brief Specifies whether this timestamp column should be encoded using the deprecated int96 + * physical type. Only valid for the following column types: + * timestamp_s, timestamp_ms, timestamp_us, timestamp_ns + * + * @param req True = use int96 physical type. 
False = use int64 physical type + * @return this for chaining + */ + column_in_metadata& set_int96_timestamps(bool req) + { + _use_int96_timestamp = req; + return *this; + } + + /** + * @brief Set the decimal precision of this column. Only valid if this column is a decimal + * (fixed-point) type + * + * @param precision The integer precision to set for this decimal column + * @return this for chaining + */ + column_in_metadata& set_decimal_precision(uint8_t precision) + { + _decimal_precision = precision; + return *this; + } + + /** + * @brief Get reference to a child of this column + * + * @param i Index of the child to get + * @return this for chaining + */ + column_in_metadata& child(size_type i) { return children[i]; } + + /** + * @brief Get const reference to a child of this column + * + * @param i Index of the child to get + * @return this for chaining + */ + column_in_metadata const& child(size_type i) const { return children[i]; } + + /** + * @brief Get the name of this column + */ + std::string get_name() const { return _name; } + + /** + * @brief Get whether nullability has been explicitly set for this column. + */ + bool is_nullability_defined() const { return _nullable.has_value(); } + + /** + * @brief Gets the explicitly set nullability for this column. + * @throws If nullability is not explicitly defined for this column. + * Check using `is_nullability_defined()` first. + */ + bool nullable() const { return _nullable.value(); } + + /** + * @brief If this is the metadata of a list column, returns whether it is to be encoded as a map. 
+ */ + bool is_map() const { return _list_column_is_map; } + + /** + * @brief Get whether to encode this timestamp column using deprecated int96 physical type + */ + bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } + + /** + * @brief Get whether precision has been set for this decimal column + */ + bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } + + /** + * @brief Get the decimal precision that was set for this column. + * @throws If decimal precision was not set for this column. + * Check using `is_decimal_precision_set()` first. + */ + uint8_t get_decimal_precision() const { return _decimal_precision.value(); } + + /** + * @brief Get the number of children of this column + */ + size_type num_children() const { return children.size(); } +}; + +class table_input_metadata { + public: + table_input_metadata() = default; // Required by cython + + /** + * @brief Construct a new table_input_metadata from a table_view. + * + * The constructed table_input_metadata has the same structure as the passed table_view + * + * @param table The table_view to construct metadata for + * @param user_data Optional Additional metadata to encode, as key-value pairs + */ + table_input_metadata(table_view const& table, std::map user_data = {}); + + std::vector column_metadata; + std::map user_data; //!< Format-dependent metadata as key-values pairs +}; + } // namespace io } // namespace cudf diff --git a/cpp/include/cudf/utilities/bit.hpp b/cpp/include/cudf/utilities/bit.hpp index 458587946f2..cbd09fa7b0d 100644 --- a/cpp/include/cudf/utilities/bit.hpp +++ b/cpp/include/cudf/utilities/bit.hpp @@ -104,6 +104,7 @@ CUDA_HOST_DEVICE_CALLABLE void clear_bit_unsafe(bitmask_type* bitmask, size_type /** * @brief Indicates whether the specified bit is set to `1` * + * @param bitmask The bitmask containing the bit to clear * @param bit_index Index of the bit to test * @return true The specified bit is `1` * @return false The specified bit is 
`0` @@ -114,6 +115,23 @@ CUDA_HOST_DEVICE_CALLABLE bool bit_is_set(bitmask_type const* bitmask, size_type return bitmask[word_index(bit_index)] & (bitmask_type{1} << intra_word_index(bit_index)); } +/** + * @brief optional-like interface to check if a specified bit of a bitmask is set. + * + * @param bitmask The bitmask containing the bit to clear + * @param bit_index Index of the bit to test + * @param default_value Value to return if `bitmask` is nullptr + * @return true The specified bit is `1` + * @return false The specified bit is `0` + * @return `default_value` if `bitmask` is nullptr + */ +CUDA_HOST_DEVICE_CALLABLE bool bit_value_or(bitmask_type const* bitmask, + size_type bit_index, + bool default_value) +{ + return bitmask != nullptr ? bit_is_set(bitmask, bit_index) : default_value; +} + /** * @brief Returns a bitmask word with the `n` least significant bits set. * diff --git a/cpp/include/cudf_test/io_metadata_utilities.hpp b/cpp/include/cudf_test/io_metadata_utilities.hpp new file mode 100644 index 00000000000..6ca6eba6884 --- /dev/null +++ b/cpp/include/cudf_test/io_metadata_utilities.hpp @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +namespace cudf::test { + +void expect_metadata_equal(cudf::io::table_input_metadata in_meta, + cudf::io::table_metadata out_meta); + +} diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index eeafd959f87..c9b6c6e9f91 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -146,7 +146,7 @@ __global__ void __launch_bounds__(block_size, 2) if (t == 0) { s->chunk = chunks[group_id][str_col_idx]; - s->chunk.leaf_column = &orc_columns[col_idx].cudf_column; + s->chunk.leaf_column = &orc_columns[col_idx]; s->chunk.dict_data = dict_data[str_col_idx].data() + rowgroup_bounds[group_id][col_idx].begin; s->chunk.dict_index = dict_index[str_col_idx].data(); s->chunk.start_row = rowgroup_bounds[group_id][col_idx].begin; diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h index 77de0b0b286..405bf7c2ecc 100644 --- a/cpp/src/io/orc/orc.h +++ b/cpp/src/io/orc/orc.h @@ -615,9 +615,13 @@ class metadata { /** * @brief `column_device_view` and additional, ORC specific, information on the column. 
*/ -struct orc_column_device_view { - column_device_view cudf_column; +struct orc_column_device_view : public column_device_view { + __device__ orc_column_device_view(column_device_view col, thrust::optional parent_idx) + : column_device_view{col}, parent_index{parent_idx} + { + } thrust::optional parent_index; + bitmask_type const* pushdown_mask = nullptr; }; /** diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index 88d7e26b3b6..389895abc83 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -135,6 +135,8 @@ struct RowGroup { struct EncChunk { uint32_t start_row; // start row of this chunk uint32_t num_rows; // number of rows in this chunk + uint32_t null_mask_start_row; // adjusted to multiple of 8 + uint32_t null_mask_num_rows; // adjusted to multiple of 8 ColumnEncodingKind encoding_kind; // column encoding kind TypeKind type_kind; // column data type uint8_t dtype_len; // data type length @@ -142,7 +144,7 @@ struct EncChunk { uint32_t* dict_index; // dictionary index from row index uint32_t* decimal_offsets; - column_device_view const* leaf_column; + orc_column_device_view const* column; }; /** @@ -182,7 +184,7 @@ struct DictionaryChunk { uint32_t num_dict_strings; // number of strings in dictionary uint32_t dict_char_count; // size of dictionary string data for this chunk - column_device_view const* leaf_column; //!< Pointer to string column + orc_column_device_view const* leaf_column; //!< Pointer to string column }; /** @@ -197,7 +199,7 @@ struct StripeDictionary { uint32_t num_strings; // number of unique strings in the dictionary uint32_t dict_char_count; // total size of dictionary string data - column_device_view const* leaf_column; //!< Pointer to string column + orc_column_device_view const* leaf_column; //!< Pointer to string column }; constexpr uint32_t encode_block_size = 512; @@ -326,17 +328,6 @@ void EncodeStripeDictionaries(StripeDictionary const* stripes, device_2dspan enc_streams, rmm::cuda_stream_view 
stream); -/** - * @brief Set leaf column element of EncChunk - * - * @param[in] orc_columns Pre-order flattened device array of ORC column views - * @param[in,out] chunks encoder chunk device array [column][rowgroup] - * @param[in] stream CUDA stream used for device memory operations and kernel launches - */ -void set_chunk_columns(device_span orc_columns, - device_2dspan chunks, - rmm::cuda_stream_view stream); - /** * @brief Launches kernel for compacting chunked column data prior to compression * @@ -440,6 +431,7 @@ void orc_init_statistics_buffersize(statistics_merge_group* groups, * @param[in,out] groups Statistics merge groups * @param[in,out] chunks Statistics data * @param[in] statistics_count Number of statistics buffers + * @param[in] stream CUDA stream used for device memory operations and kernel launches */ void orc_encode_statistics(uint8_t* blob_bfr, statistics_merge_group* groups, @@ -447,6 +439,19 @@ void orc_encode_statistics(uint8_t* blob_bfr, uint32_t statistics_count, rmm::cuda_stream_view stream); +/** + * @brief Number of set bits in pushdown masks, per rowgroup. 
+ * + * @param[in] orc_columns Pre-order flattened device array of ORC column views + * @param[in] rowgroup_bounds Ranges of rows in each rowgroup [rowgroup][column] + * @param[out] set_counts Per rowgroup number of set bits + * @param[in] stream CUDA stream used for device memory operations and kernel launches + */ +void reduce_pushdown_masks(device_span orc_columns, + device_2dspan rowgroup_bounds, + device_2dspan set_counts, + rmm::cuda_stream_view stream); + } // namespace gpu } // namespace orc } // namespace io diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 9348d817dad..cc7e22f2042 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -265,7 +265,6 @@ static __device__ uint32_t ByteRLE( } } if (!t) { s->strm_pos[cid] = static_cast(dst - s->stream.data_ptrs[cid]); } - __syncthreads(); return out_cnt; } @@ -621,6 +620,100 @@ inline __device__ void lengths_to_positions(volatile T* vals, uint32_t numvals, static const __device__ __constant__ int32_t kTimeScale[10] = { 1000000000, 100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1}; +template +static __device__ void encode_null_mask(orcenc_state_s* s, + bitmask_type const* pushdown_mask, + Storage& scan_storage, + int t) +{ + if (s->stream.ids[CI_PRESENT] < 0) return; + + auto const column = *s->chunk.column; + while (s->present_rows < s->chunk.null_mask_num_rows or s->numvals > 0) { + // Number of rows read so far + auto present_rows = s->present_rows; + // valid_buf capacity is byte per thread in block + auto const buf_available_bits = encode_block_size * 8 - s->numvals; + // Number of rows for the block to process in this iteration + auto const nrows = min(s->chunk.null_mask_num_rows - present_rows, buf_available_bits); + // Number of rows for this thread to process in this iteration + auto const t_nrows = min(max(static_cast(nrows) - t * 8, 0), 8); + auto const row = s->chunk.null_mask_start_row + present_rows + t * 8; + + auto get_mask_byte 
= [&](bitmask_type const* mask, size_type offset) -> uint8_t { + if (t_nrows == 0) return 0; + if (mask == nullptr) return 0xff; + + auto const begin_offset = row + offset; + auto const end_offset = min(begin_offset + 8, offset + column.size()); + auto const mask_word = cudf::detail::get_mask_offset_word(mask, 0, begin_offset, end_offset); + return mask_word & 0xff; + }; + + uint8_t pd_byte = (1 << t_nrows) - 1; + uint32_t pd_set_cnt = t_nrows; + uint32_t offset = t_nrows != 0 ? t * 8 : nrows; + if (pushdown_mask != nullptr) { + pd_byte = get_mask_byte(pushdown_mask, 0) & ((1 << t_nrows) - 1); + pd_set_cnt = __popc(pd_byte); + // Scan the number of valid bits to get dst offset for each thread + cub::BlockScan(scan_storage).ExclusiveSum(pd_set_cnt, offset); + } + + auto const mask_byte = get_mask_byte(column.null_mask(), column.offset()); + auto dst_offset = offset + s->nnz; + auto vbuf_bit_idx = [](int row) { + // valid_buf is a circular buffer with validity of 8 rows in each element + return row % (encode_block_size * 8); + }; + if (dst_offset % 8 == 0 and pd_set_cnt == 8) { + s->valid_buf[vbuf_bit_idx(dst_offset) / 8] = mask_byte; + } else { + for (auto bit_idx = 0; bit_idx < t_nrows; ++bit_idx) { + // skip bits where pushdown mask is not set + if (not(pd_byte & (1 << bit_idx))) continue; + if (mask_byte & (1 << bit_idx)) { + set_bit(reinterpret_cast(s->valid_buf), vbuf_bit_idx(dst_offset++)); + } else { + clear_bit(reinterpret_cast(s->valid_buf), vbuf_bit_idx(dst_offset++)); + } + } + } + + __syncthreads(); + if (t == block_size - 1) { + // Number of loaded rows, available for encode + s->numvals += offset + pd_set_cnt; + // Number of loaded rows (different from present_rows because of pushdown masks) + s->nnz += offset + pd_set_cnt; + } + present_rows += nrows; + if (!t) { s->present_rows = present_rows; } + __syncthreads(); + + // RLE encode the present stream + if (s->numvals > ((present_rows < s->chunk.null_mask_num_rows) ?
130 * 8 : 0)) { + auto const flush = (present_rows < s->chunk.null_mask_num_rows) ? 0 : 7; + auto const nbytes_out = (s->numvals + flush) / 8; + auto const nrows_encoded = + ByteRLE(s, s->valid_buf, s->present_out / 8, nbytes_out, flush, t) * 8; + + if (!t) { + // Number of rows encoded so far + s->present_out += nrows_encoded; + s->numvals -= min(s->numvals, nrows_encoded); + } + __syncthreads(); + } + } + + // reset shared state + if (t == 0) { + s->nnz = 0; + s->numvals = 0; + } +} + /** * @brief Encode column data * @@ -635,6 +728,7 @@ __global__ void __launch_bounds__(block_size) { __shared__ __align__(16) orcenc_state_s state_g; __shared__ union { + typename cub::BlockScan::TempStorage scan_u32; typename cub::BlockReduce::TempStorage i32; typename cub::BlockReduce::TempStorage i64; typename cub::BlockReduce::TempStorage u32; @@ -646,120 +740,74 @@ __global__ void __launch_bounds__(block_size) uint32_t group_id = blockIdx.y; int t = threadIdx.x; if (t == 0) { - s->chunk = chunks[col_id][group_id]; - s->stream = streams[col_id][group_id]; - } - if (t < CI_NUM_STREAMS) { s->strm_pos[t] = 0; } - __syncthreads(); - if (!t) { - s->cur_row = 0; - s->present_rows = 0; - s->present_out = 0; - s->numvals = 0; - s->numlengths = 0; - s->nnz = 0; + s->chunk = chunks[col_id][group_id]; + s->stream = streams[col_id][group_id]; + s->cur_row = 0; + s->present_rows = 0; + s->present_out = 0; + s->numvals = 0; + s->numlengths = 0; + s->nnz = 0; + s->strm_pos[CI_DATA] = 0; + s->strm_pos[CI_PRESENT] = 0; + s->strm_pos[CI_INDEX] = 0; // Dictionary data is encoded in a separate kernel - if (s->chunk.encoding_kind == DICTIONARY_V2) { - s->strm_pos[CI_DATA2] = s->stream.lengths[CI_DATA2]; - s->strm_pos[CI_DICTIONARY] = s->stream.lengths[CI_DICTIONARY]; - } + s->strm_pos[CI_DATA2] = + s->chunk.encoding_kind == DICTIONARY_V2 ? s->stream.lengths[CI_DATA2] : 0; + s->strm_pos[CI_DICTIONARY] = + s->chunk.encoding_kind == DICTIONARY_V2 ?
s->stream.lengths[CI_DICTIONARY] : 0; } + __syncthreads(); - auto validity_byte = [&] __device__(int row) -> uint8_t& { - // valid_buf is a circular buffer where validitiy of 8 rows is stored in each element - return s->valid_buf[(row / 8) % encode_block_size]; - }; - - auto validity = [&] __device__(int row) -> uint32_t { - // Check if the specific bit is set in the validity buffer - return (validity_byte(row) >> (row % 8)) & 1; - }; + auto const pushdown_mask = [&]() -> cudf::bitmask_type const* { + auto const parent_index = s->chunk.column->parent_index; + if (!parent_index.has_value()) return nullptr; + return chunks[parent_index.value()][0].column->pushdown_mask; + }(); + encode_null_mask(s, pushdown_mask, temp_storage.scan_u32, t); __syncthreads(); + + auto const column = *s->chunk.column; while (s->cur_row < s->chunk.num_rows || s->numvals + s->numlengths != 0) { - // Encode valid map - if (s->present_rows < s->chunk.num_rows) { - uint32_t present_rows = s->present_rows; - uint32_t nrows = - min(s->chunk.num_rows - present_rows, - encode_block_size * 8 - (present_rows - (min(s->cur_row, s->present_out) & ~7))); - uint32_t nrows_out; - if (t * 8 < nrows) { - auto const row_in_group = present_rows + t * 8; - auto const row = s->chunk.start_row + row_in_group; - uint8_t valid = 0; - if (row < s->chunk.leaf_column->size()) { - if (s->chunk.leaf_column->nullable()) { - auto const current_valid_offset = row + s->chunk.leaf_column->offset(); - auto const last_offset = - min(current_valid_offset + 8, - s->chunk.leaf_column->offset() + s->chunk.leaf_column->size()); - auto const mask = cudf::detail::get_mask_offset_word( - s->chunk.leaf_column->null_mask(), 0, current_valid_offset, last_offset); - valid = 0xff & mask; - } else { - valid = 0xff; - } - if (row + 7 > s->chunk.leaf_column->size()) { - valid = valid & ((1 << (s->chunk.leaf_column->size() - row)) - 1); - } - } - validity_byte(row_in_group) = valid; - } - __syncthreads(); - present_rows += nrows; - if (!t) 
{ s->present_rows = present_rows; } - // RLE encode the present stream - nrows_out = present_rows - s->present_out; // Should always be a multiple of 8 except at - // the end of the last row group - if (nrows_out > ((present_rows < s->chunk.num_rows) ? 130 * 8 : 0)) { - uint32_t present_out = s->present_out; - if (s->stream.ids[CI_PRESENT] >= 0) { - uint32_t flush = (present_rows < s->chunk.num_rows) ? 0 : 7; - nrows_out = (nrows_out + flush) >> 3; - nrows_out = - ByteRLE(s, s->valid_buf, present_out >> 3, nrows_out, flush, t) * 8; - } - __syncthreads(); - if (!t) { s->present_out = min(present_out + nrows_out, present_rows); } - } - __syncthreads(); - } // Fetch non-null values if (s->chunk.type_kind != LIST && !s->stream.data_ptrs[CI_DATA]) { // Pass-through __syncthreads(); if (!t) { - s->cur_row = s->present_rows; - s->strm_pos[CI_DATA] = s->cur_row * s->chunk.dtype_len; + s->cur_row = s->chunk.num_rows; + s->strm_pos[CI_DATA] = s->chunk.num_rows * s->chunk.dtype_len; } - __syncthreads(); - } else if (s->cur_row < s->present_rows) { + } else if (s->cur_row < s->chunk.num_rows) { uint32_t maxnumvals = (s->chunk.type_kind == BOOLEAN) ? 2048 : 1024; uint32_t nrows = - min(min(s->present_rows - s->cur_row, maxnumvals - max(s->numvals, s->numlengths)), + min(min(s->chunk.num_rows - s->cur_row, maxnumvals - max(s->numvals, s->numlengths)), encode_block_size); - auto const row_in_group = s->cur_row + t; - uint32_t const valid = (t < nrows) ? validity(row_in_group) : 0; - s->buf.u32[t] = valid; + auto const row = s->chunk.start_row + s->cur_row + t; + + auto const is_value_valid = [&]() { + if (t >= nrows) return false; + return bit_value_or(pushdown_mask, column.offset() + row, true) and + bit_value_or(column.null_mask(), column.offset() + row, true); + }(); + s->buf.u32[t] = is_value_valid ? 
1u : 0u; // TODO: Could use a faster reduction relying on _popc() for the initial phase lengths_to_positions(s->buf.u32, encode_block_size, t); __syncthreads(); - auto const row = s->chunk.start_row + row_in_group; - if (valid) { + if (is_value_valid) { int nz_idx = (s->nnz + s->buf.u32[t] - 1) & (maxnumvals - 1); switch (s->chunk.type_kind) { case INT: case DATE: - case FLOAT: s->vals.u32[nz_idx] = s->chunk.leaf_column->element(row); break; + case FLOAT: s->vals.u32[nz_idx] = column.element(row); break; case DOUBLE: - case LONG: s->vals.u64[nz_idx] = s->chunk.leaf_column->element(row); break; - case SHORT: s->vals.u32[nz_idx] = s->chunk.leaf_column->element(row); break; + case LONG: s->vals.u64[nz_idx] = column.element(row); break; + case SHORT: s->vals.u32[nz_idx] = column.element(row); break; case BOOLEAN: - case BYTE: s->vals.u8[nz_idx] = s->chunk.leaf_column->element(row); break; + case BYTE: s->vals.u8[nz_idx] = column.element(row); break; case TIMESTAMP: { - int64_t ts = s->chunk.leaf_column->element(row); + int64_t ts = column.element(row); int32_t ts_scale = kTimeScale[min(s->chunk.scale, 9)]; int64_t seconds = ts / ts_scale; int64_t nanos = (ts - seconds * ts_scale); @@ -796,7 +844,7 @@ __global__ void __launch_bounds__(block_size) } s->vals.u32[nz_idx] = dict_idx; } else { - string_view value = s->chunk.leaf_column->element(row); + string_view value = column.element(row); s->u.strenc.str_data[s->buf.u32[t] - 1] = value.data(); s->lengths.u32[nz_idx] = value.size_bytes(); } @@ -805,11 +853,10 @@ __global__ void __launch_bounds__(block_size) // Note: can be written in a faster manner, given that all values are equal case DECIMAL: s->lengths.u32[nz_idx] = zigzag(s->chunk.scale); break; case LIST: { - auto const& offsets = - s->chunk.leaf_column->child(lists_column_view::offsets_column_index); + auto const& offsets = column.child(lists_column_view::offsets_column_index); // Compute list length from the offsets - s->lengths.u32[nz_idx] = - offsets.element(row 
+ 1) - offsets.element(row); + s->lengths.u32[nz_idx] = offsets.element(row + 1 + column.offset()) - + offsets.element(row + column.offset()); } break; default: break; } @@ -897,10 +944,10 @@ __global__ void __launch_bounds__(block_size) } break; case DECIMAL: { - if (valid) { - uint64_t const zz_val = (s->chunk.leaf_column->type().id() == type_id::DECIMAL32) - ? zigzag(s->chunk.leaf_column->element(row)) - : zigzag(s->chunk.leaf_column->element(row)); + if (is_value_valid) { + uint64_t const zz_val = (column.type().id() == type_id::DECIMAL32) + ? zigzag(column.element(row)) + : zigzag(column.element(row)); auto const offset = (row == s->chunk.start_row) ? 0 : s->chunk.decimal_offsets[row - 1]; StoreVarint(s->stream.data_ptrs[CI_DATA] + offset, zz_val); @@ -942,8 +989,8 @@ __global__ void __launch_bounds__(block_size) streams[col_id][group_id].lengths[t] = s->strm_pos[t]; if (!s->stream.data_ptrs[t]) { streams[col_id][group_id].data_ptrs[t] = - static_cast(const_cast(s->chunk.leaf_column->head())) + - (s->chunk.leaf_column->offset() + s->chunk.start_row) * s->chunk.dtype_len; + static_cast(const_cast(column.head())) + + (column.offset() + s->chunk.start_row) * s->chunk.dtype_len; } } } @@ -1033,16 +1080,6 @@ __global__ void __launch_bounds__(block_size) if (t == 0) { strm_ptr->lengths[cid] = s->strm_pos[cid]; } } -__global__ void __launch_bounds__(512) - gpu_set_chunk_columns(device_span orc_columns, - device_2dspan chunks) -{ - // Set leaf_column member of EncChunk - for (size_type i = threadIdx.x; i < chunks.size().second; i += blockDim.x) { - chunks[blockIdx.x][i].leaf_column = &orc_columns[blockIdx.x].cudf_column; - } -} - /** * @brief Merge chunked column data into a single contiguous stream * @@ -1255,16 +1292,6 @@ void EncodeStripeDictionaries(StripeDictionary const* stripes, <<>>(stripes, chunks, enc_streams); } -void set_chunk_columns(device_span orc_columns, - device_2dspan chunks, - rmm::cuda_stream_view stream) -{ - dim3 dim_block(512, 1); - dim3 
dim_grid(chunks.size().first, 1); - - gpu_set_chunk_columns<<>>(orc_columns, chunks); -} - void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index d6dbdbe6403..be561530459 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -19,6 +19,7 @@ #include +#include #include namespace cudf { @@ -473,6 +474,45 @@ extern "C" __global__ void __launch_bounds__(128, 8) } } +template +__global__ void __launch_bounds__(block_size) + gpu_reduce_pushdown_masks(device_span orc_columns, + device_2dspan rowgroup_bounds, + device_2dspan set_counts) +{ + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + auto const column_id = blockIdx.x; + auto const rowgroup_id = blockIdx.y; + auto const column = orc_columns[column_id]; + auto const t = threadIdx.x; + + auto const use_child_rg = column.type().id() == type_id::LIST; + auto const rg = rowgroup_bounds[rowgroup_id][column_id + (use_child_rg ? 
1 : 0)]; + + if (column.pushdown_mask == nullptr) { + // All elements are valid if the null mask is not present + if (t == 0) { set_counts[rowgroup_id][column_id] = rg.size(); } + return; + }; + + size_type count = 0; + static constexpr size_type bits_per_word = sizeof(bitmask_type) * 8; + for (auto row = t * bits_per_word + rg.begin; row < rg.end; row += block_size * bits_per_word) { + auto const begin_bit = row; + auto const end_bit = min(static_cast(row + bits_per_word), rg.end); + auto const mask_len = end_bit - begin_bit; + auto const mask_word = + cudf::detail::get_mask_offset_word(column.pushdown_mask, 0, row, end_bit) & + ((1 << mask_len) - 1); + count += __popc(mask_word); + } + + count = BlockReduce(temp_storage).Sum(count); + if (t == 0) { set_counts[rowgroup_id][column_id] = count; } +} + void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t compression_block_size, @@ -495,19 +535,6 @@ void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info, num_streams); } -/** - * @brief Launches kernel for constructing rowgroup from index streams - * - * @param[out] row_groups RowGroup device array [rowgroup][column] - * @param[in] strm_info List of compressed streams (or NULL if uncompressed) - * @param[in] chunks ColumnDesc device array [stripe][column] - * @param[in] num_columns Number of columns - * @param[in] num_stripes Number of stripes - * @param[in] num_rowgroups Number of row groups - * @param[in] rowidx_stride Row index stride - * @param[in] use_base_stride Whether to use base stride obtained from meta or the computed value - * @param[in] stream CUDA stream used for device memory operations and kernel launches - */ void __host__ ParseRowGroupIndex(RowGroup* row_groups, CompressedStreamInfo* strm_info, ColumnDesc* chunks, @@ -530,6 +557,17 @@ void __host__ ParseRowGroupIndex(RowGroup* row_groups, use_base_stride); } +void __host__ reduce_pushdown_masks(device_span columns, + device_2dspan 
rowgroups, + device_2dspan valid_counts, + rmm::cuda_stream_view stream) +{ + dim3 dim_block(128, 1); + dim3 dim_grid(columns.size(), rowgroups.size().first); // 1 rowgroup per block + gpu_reduce_pushdown_masks<128> + <<>>(columns, rowgroups, valid_counts); +} + } // namespace gpu } // namespace orc } // namespace io diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 8a0112deb76..299c8fbb730 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -99,6 +99,7 @@ constexpr orc::TypeKind to_orc_type(cudf::type_id id) case cudf::type_id::DECIMAL32: case cudf::type_id::DECIMAL64: return TypeKind::DECIMAL; case cudf::type_id::LIST: return TypeKind::LIST; + case cudf::type_id::STRUCT: return TypeKind::STRUCT; default: return TypeKind::INVALID_TYPE_KIND; } } @@ -142,30 +143,30 @@ class orc_column_view { */ explicit orc_column_view(uint32_t index, int str_idx, - int index_in_table, + orc_column_view* parent, column_view const& col, - const table_metadata* metadata) + column_in_metadata const& metadata) : cudf_column{col}, _index{index}, _str_idx{str_idx}, - _is_child{index_in_table < 0}, + _is_child{parent != nullptr}, _type_width{cudf::is_fixed_width(col.type()) ? cudf::size_of(col.type()) : 0}, _scale{(to_orc_type(col.type().id()) == TypeKind::DECIMAL) ? -col.type().scale() : to_clockscale(col.type().id())}, - _precision{orc_precision(col.type().id())}, - _type_kind{to_orc_type(col.type().id())} + _precision{metadata.is_decimal_precision_set() ? 
metadata.get_decimal_precision() + : orc_precision(col.type().id())}, + _type_kind{to_orc_type(col.type().id())}, + name{metadata.get_name()} { - // Don't assign names to child columns - if (index_in_table >= 0) { - if (metadata != nullptr && index_in_table < static_cast(metadata->column_names.size())) { - _name = metadata->column_names[index_in_table]; - } else { - // Generating default name if name isn't present in metadata - _name = "_col" + std::to_string(index_in_table); - } + if (metadata.is_nullability_defined()) { nullable_from_metadata = metadata.nullable(); } + if (parent != nullptr) { + parent->add_child(_index); + _parent_index = parent->index(); } } + void add_child(uint32_t child_idx) { children.emplace_back(child_idx); } + auto is_string() const noexcept { return cudf_column.type().id() == type_id::STRING; } void set_dict_stride(size_t stride) noexcept { _dict_stride = stride; } auto dict_stride() const noexcept { return _dict_stride; } @@ -206,15 +207,22 @@ class orc_column_view { auto device_stripe_dict() const noexcept { return d_stripe_dict; } // Index in the table - auto index() const noexcept { return _index; } + uint32_t index() const noexcept { return _index; } // Id in the ORC file auto id() const noexcept { return _index + 1; } + auto is_child() const noexcept { return _is_child; } + auto parent_index() const noexcept { return _parent_index.value(); } + auto child_begin() const noexcept { return children.cbegin(); } + auto child_end() const noexcept { return children.cend(); } + auto type_width() const noexcept { return _type_width; } auto size() const noexcept { return cudf_column.size(); } + auto null_count() const noexcept { return cudf_column.null_count(); } auto null_mask() const noexcept { return cudf_column.null_mask(); } bool nullable() const noexcept { return null_mask() != nullptr; } + auto user_defined_nullable() const noexcept { return nullable_from_metadata; } auto scale() const noexcept { return _scale; } auto precision() 
const noexcept { return _precision; } @@ -222,7 +230,7 @@ class orc_column_view { void set_orc_encoding(ColumnEncodingKind e) noexcept { _encoding_kind = e; } auto orc_kind() const noexcept { return _type_kind; } auto orc_encoding() const noexcept { return _encoding_kind; } - auto orc_name() const noexcept { return _name; } + std::string_view orc_name() const noexcept { return name; } private: column_view cudf_column; @@ -238,9 +246,9 @@ class orc_column_view { int32_t _precision = 0; // ORC-related members - std::string _name{}; - TypeKind _type_kind; - ColumnEncodingKind _encoding_kind; + TypeKind _type_kind = INVALID_TYPE_KIND; + ColumnEncodingKind _encoding_kind = INVALID_ENCODING_KIND; + std::string name; // String dictionary-related members size_t _dict_stride = 0; @@ -252,6 +260,10 @@ class orc_column_view { // Offsets for encoded decimal elements. Used to enable direct writing of encoded decimal elements // into the output stream. uint32_t* d_decimal_offsets = nullptr; + + std::optional nullable_from_metadata; + std::vector children; + std::optional _parent_index; }; size_type orc_table_view::num_rows() const noexcept @@ -476,11 +488,13 @@ orc_streams writer::impl::create_streams(host_span columns, if (single_write_mode) { return column.nullable(); } else { - if (user_metadata_with_nullability.column_nullable.empty()) return true; - CUDF_EXPECTS(user_metadata_with_nullability.column_nullable.size() > column.index(), - "When passing values in user_metadata_with_nullability, data for all columns " - "must be specified"); - return user_metadata_with_nullability.column_nullable[column.index()]; + // For chunked write, when not provided nullability, we assume the worst case scenario + // that all columns are nullable. + auto const chunked_nullable = column.user_defined_nullable().value_or(true); + CUDF_EXPECTS(chunked_nullable or !column.nullable(), + "Mismatch in metadata prescribed nullability and input column nullability. 
" + "Metadata for nullable input column cannot prescribe nullability = false"); + return chunked_nullable; } }(); @@ -594,6 +608,9 @@ orc_streams writer::impl::create_streams(host_span columns, add_RLE_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); break; + case TypeKind::STRUCT: + // Only has the present stream + break; default: CUDF_FAIL("Unsupported ORC type kind"); } } @@ -641,16 +658,161 @@ orc_streams::orc_stream_offsets orc_streams::compute_offsets( return {std::move(strm_offsets), non_rle_data_size, rle_data_size}; } +std::vector> calculate_aligned_rowgroup_bounds( + orc_table_view const& orc_table, + file_segmentation const& segmentation, + rmm::cuda_stream_view stream) +{ + if (segmentation.num_rowgroups() == 0) return {}; + + auto d_pd_set_counts_data = rmm::device_uvector( + orc_table.num_columns() * segmentation.num_rowgroups(), stream); + auto const d_pd_set_counts = device_2dspan{ + d_pd_set_counts_data.data(), segmentation.num_rowgroups(), orc_table.num_columns()}; + gpu::reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); + + auto aligned_rgs = hostdevice_2dvector( + segmentation.num_rowgroups(), orc_table.num_columns(), stream); + CUDA_TRY(cudaMemcpyAsync(aligned_rgs.base_device_ptr(), + segmentation.rowgroups.base_device_ptr(), + aligned_rgs.count() * sizeof(rowgroup_rows), + cudaMemcpyDefault, + stream.value())); + auto const d_stripes = cudf::detail::make_device_uvector_async(segmentation.stripes, stream); + + // One thread per column, per stripe + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + orc_table.num_columns() * segmentation.num_stripes(), + [columns = device_span{orc_table.d_columns}, + stripes = device_span{d_stripes}, + d_pd_set_counts, + out_rowgroups = device_2dspan{aligned_rgs}] __device__(auto& idx) { + uint32_t const col_idx = idx / stripes.size(); + // No alignment needed for root columns + if (not 
columns[col_idx].parent_index.has_value()) return; + + auto const stripe_idx = idx % stripes.size(); + auto const stripe = stripes[stripe_idx]; + auto const parent_col_idx = columns[col_idx].parent_index.value(); + auto const parent_column = columns[parent_col_idx]; + auto const stripe_end = stripe.first + stripe.size; + + auto seek_last_borrow_rg = [&](auto rg_idx, size_type& bits_to_borrow) { + auto curr = rg_idx + 1; + auto curr_rg_size = [&]() { + return parent_column.pushdown_mask != nullptr ? d_pd_set_counts[curr][parent_col_idx] + : out_rowgroups[curr][col_idx].size(); + }; + while (curr < stripe_end and curr_rg_size() <= bits_to_borrow) { + // All bits from rowgroup borrowed, make the rowgroup empty + out_rowgroups[curr][col_idx].begin = out_rowgroups[curr][col_idx].end; + bits_to_borrow -= curr_rg_size(); + ++curr; + } + return curr; + }; + + int previously_borrowed = 0; + for (auto rg_idx = stripe.first; rg_idx + 1 < stripe_end; ++rg_idx) { + auto& rg = out_rowgroups[rg_idx][col_idx]; + + if (parent_column.pushdown_mask == nullptr) { + // No pushdown mask, all null mask bits will be encoded + // Align on rowgroup size (can be misaligned for list children) + if (rg.size() % 8) { + auto bits_to_borrow = 8 - rg.size() % 8; + auto const last_borrow_rg_idx = seek_last_borrow_rg(rg_idx, bits_to_borrow); + if (last_borrow_rg_idx == stripe_end) { + // Didn't find enough bits to borrow, move the rowgroup end to the stripe end + rg.end = out_rowgroups[stripe_end - 1][col_idx].end; + // Done with this stripe + break; + } + auto& last_borrow_rg = out_rowgroups[last_borrow_rg_idx][col_idx]; + last_borrow_rg.begin += bits_to_borrow; + rg.end = last_borrow_rg.begin; + // Skip the rowgroups we emptied in the loop + rg_idx = last_borrow_rg_idx - 1; + } + } else { + // pushdown mask present; null mask bits w/ set pushdown mask bits will be encoded + // Use the number of set bits in pushdown mask as size + auto bits_to_borrow = + 8 - (d_pd_set_counts[rg_idx][parent_col_idx] 
- previously_borrowed) % 8; + if (bits_to_borrow == 0) { + // Didn't borrow any bits for this rowgroup + previously_borrowed = 0; + continue; + } + + // Find rowgroup in which we finish the search for missing bits + auto const last_borrow_rg_idx = seek_last_borrow_rg(rg_idx, bits_to_borrow); + if (last_borrow_rg_idx == stripe_end) { + // Didn't find enough bits to borrow, move the rowgroup end to the stripe end + rg.end = out_rowgroups[stripe_end - 1][col_idx].end; + // Done with this stripe + break; + } + + auto& last_borrow_rg = out_rowgroups[last_borrow_rg_idx][col_idx]; + // First row that does not need to be borrowed + auto borrow_end = last_borrow_rg.begin; + + // Adjust the number of bits to borrow in the next iteration + previously_borrowed = bits_to_borrow; + + // Find word in which we finish the search for missing bits (guaranteed to be available) + while (bits_to_borrow != 0) { + auto const mask = cudf::detail::get_mask_offset_word( + parent_column.pushdown_mask, 0, borrow_end, borrow_end + 32); + auto const valid_in_word = __popc(mask); + + if (valid_in_word > bits_to_borrow) break; + bits_to_borrow -= valid_in_word; + borrow_end += 32; + } + + // Find the last of the missing bits (guaranteed to be available) + while (bits_to_borrow != 0) { + if (bit_is_set(parent_column.pushdown_mask, borrow_end)) { --bits_to_borrow; }; + ++borrow_end; + } + + last_borrow_rg.begin = borrow_end; + rg.end = borrow_end; + // Skip the rowgroups we emptied in the loop + rg_idx = last_borrow_rg_idx - 1; + } + } + }); + + aligned_rgs.device_to_host(stream, true); + + std::vector> h_aligned_rgs; + h_aligned_rgs.reserve(segmentation.num_rowgroups()); + std::transform(thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(segmentation.num_rowgroups()), + std::back_inserter(h_aligned_rgs), + [&](auto idx) -> std::vector { + return {aligned_rgs[idx].begin(), aligned_rgs[idx].end()}; + }); + + return h_aligned_rgs; +} + struct segmented_valid_cnt_input { bitmask_type 
const* mask; std::vector indices; }; -encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, - string_dictionaries&& dictionaries, - encoder_decimal_info&& dec_chunk_sizes, - file_segmentation const& segmentation, - orc_streams const& streams) +encoded_data encode_columns(orc_table_view const& orc_table, + string_dictionaries&& dictionaries, + encoder_decimal_info&& dec_chunk_sizes, + file_segmentation const& segmentation, + orc_streams const& streams, + rmm::cuda_stream_view stream) { auto const num_columns = orc_table.num_columns(); hostdevice_2dvector chunks(num_columns, segmentation.num_rowgroups(), stream); @@ -658,19 +820,22 @@ encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, streams.compute_offsets(orc_table.columns, segmentation.num_rowgroups()); rmm::device_uvector encoded_data(stream_offsets.data_size(), stream); + auto const aligned_rowgroups = calculate_aligned_rowgroup_bounds(orc_table, segmentation, stream); + // Initialize column chunks' descriptions std::map validity_check_inputs; for (auto const& column : orc_table.columns) { for (auto const& stripe : segmentation.stripes) { for (auto rg_idx_it = stripe.cbegin(); rg_idx_it < stripe.cend(); ++rg_idx_it) { - auto const rg_idx = *rg_idx_it; - auto& ck = chunks[column.index()][rg_idx]; - - ck.start_row = segmentation.rowgroups[rg_idx][column.index()].begin; - ck.num_rows = segmentation.rowgroups[rg_idx][column.index()].size(); - ck.encoding_kind = column.orc_encoding(); - ck.type_kind = column.orc_kind(); + auto const rg_idx = *rg_idx_it; + auto& ck = chunks[column.index()][rg_idx]; + ck.start_row = segmentation.rowgroups[rg_idx][column.index()].begin; + ck.num_rows = segmentation.rowgroups[rg_idx][column.index()].size(); + ck.null_mask_start_row = aligned_rowgroups[rg_idx][column.index()].begin; + ck.null_mask_num_rows = aligned_rowgroups[rg_idx][column.index()].size(); + ck.encoding_kind = column.orc_encoding(); + ck.type_kind = column.orc_kind(); if 
(ck.type_kind == TypeKind::STRING) { ck.dict_index = (ck.encoding_kind == DICTIONARY_V2) ? column.host_stripe_dict(stripe.id)->dict_index @@ -684,6 +849,19 @@ encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, } } } + chunks.host_to_device(stream); + // TODO (future): pass columns separately from chunks (to skip this step) + // and remove info from chunks that is common for the entire column + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + chunks.count(), + [chunks = device_2dspan{chunks}, + cols = device_span{orc_table.d_columns}] __device__(auto& idx) { + auto const col_idx = idx / chunks.size().second; + auto const rg_idx = idx % chunks.size().second; + chunks[col_idx][rg_idx].column = &cols[col_idx]; + }); auto validity_check_indices = [&](size_t col_idx) { std::vector indices; @@ -789,12 +967,8 @@ encoded_data writer::impl::encode_columns(orc_table_view const& orc_table, } } } - - chunks.host_to_device(stream); chunk_streams.host_to_device(stream); - gpu::set_chunk_columns(orc_table.d_columns, chunks, stream); - if (orc_table.num_string_columns() != 0) { auto d_stripe_dict = orc_table.string_column(0).device_stripe_dict(); gpu::EncodeStripeDictionaries(d_stripe_dict, @@ -856,11 +1030,10 @@ void set_stat_desc_leaf_cols(device_span columns, device_span stat_desc, rmm::cuda_stream_view stream) { - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0ul), - thrust::make_counting_iterator(stat_desc.size()), - [=] __device__(auto idx) { stat_desc[idx].leaf_column = &columns[idx].cudf_column; }); + thrust::for_each(rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(stat_desc.size()), + [=] __device__(auto idx) { stat_desc[idx].leaf_column = &columns[idx]; }); } std::vector> writer::impl::gather_statistic_blobs( @@ -1101,14 +1274,16 @@ writer::impl::impl(std::unique_ptr sink, SingleWriteMode mode, rmm::cuda_stream_view stream, 
rmm::mr::device_memory_resource* mr) - : compression_kind_(to_orc_compression(options.get_compression())), + : _mr(mr), + stream(stream), + compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.enable_statistics()), - out_sink_(std::move(sink)), single_write_mode(mode == SingleWriteMode::YES), - user_metadata(options.get_metadata()), - stream(stream), - _mr(mr) + out_sink_(std::move(sink)) { + if (options.get_metadata()) { + table_meta = std::make_unique(*options.get_metadata()); + } init_state(); } @@ -1117,18 +1292,16 @@ writer::impl::impl(std::unique_ptr sink, SingleWriteMode mode, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : compression_kind_(to_orc_compression(options.get_compression())), + : _mr(mr), + stream(stream), + compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.enable_statistics()), - out_sink_(std::move(sink)), single_write_mode(mode == SingleWriteMode::YES), - stream(stream), - _mr(mr) + out_sink_(std::move(sink)) { - if (options.get_metadata() != nullptr) { - user_metadata_with_nullability = *options.get_metadata(); - user_metadata = &user_metadata_with_nullability; + if (options.get_metadata()) { + table_meta = std::make_unique(*options.get_metadata()); } - init_state(); } @@ -1140,6 +1313,113 @@ void writer::impl::init_state() out_sink_->host_write(MAGIC, std::strlen(MAGIC)); } +void pushdown_lists_null_mask(orc_column_view const& col, + device_span d_columns, + bitmask_type const* parent_pd_mask, + device_span out_mask, + rmm::cuda_stream_view stream) +{ + // Set all bits - correct unless there's a mismatch between offsets and null mask + CUDA_TRY(cudaMemsetAsync(static_cast(out_mask.data()), + 255, + out_mask.size() * sizeof(bitmask_type), + stream.value())); + + // Reset bits where a null list element has rows in the child column + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0u), + col.size(), + 
[d_columns, col_idx = col.index(), parent_pd_mask, out_mask] __device__(auto& idx) { + auto const d_col = d_columns[col_idx]; + auto const is_row_valid = d_col.is_valid(idx) and bit_value_or(parent_pd_mask, idx, true); + if (not is_row_valid) { + auto offsets = d_col.child(lists_column_view::offsets_column_index); + auto const child_rows_begin = offsets.element(idx + d_col.offset()); + auto const child_rows_end = offsets.element(idx + 1 + d_col.offset()); + for (auto child_row = child_rows_begin; child_row < child_rows_end; ++child_row) + clear_bit(out_mask.data(), child_row); + } + }); +} + +/** + * @brief All pushdown masks in a table. + * + * Pushdown masks are applied to child column(s). Only bits of the child column null mask that + * correspond to set pushdown mask bits are encoded into the output file. Similarly, rows where + * pushdown mask is 0 are treated as invalid and not included in the output. + */ +struct pushdown_null_masks { + // Owning vector for masks in device memory + std::vector> data; + // Pointers to pushdown masks in device memory. Can be same for multiple columns. + std::vector masks; +}; + +pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, + rmm::cuda_stream_view stream) +{ + std::vector mask_ptrs; + mask_ptrs.reserve(orc_table.num_columns()); + std::vector> pd_masks; + for (auto const& col : orc_table.columns) { + // Leaf columns don't need pushdown masks + if (col.orc_kind() != LIST && col.orc_kind() != STRUCT) { + mask_ptrs.emplace_back(nullptr); + continue; + } + auto const parent_pd_mask = col.is_child() ? 
mask_ptrs[col.parent_index()] : nullptr;
+ auto const null_mask = col.null_mask();
+
+ if (null_mask == nullptr and parent_pd_mask == nullptr) {
+ mask_ptrs.emplace_back(nullptr);
+ continue;
+ }
+ if (col.orc_kind() == STRUCT) {
+ if (null_mask != nullptr and parent_pd_mask == nullptr) {
+ // Reuse own null mask
+ mask_ptrs.emplace_back(null_mask);
+ } else if (null_mask == nullptr and parent_pd_mask != nullptr) {
+ // Reuse parent's pushdown mask
+ mask_ptrs.emplace_back(parent_pd_mask);
+ } else {
+ // Both are nullable, allocate new pushdown mask
+ pd_masks.emplace_back(num_bitmask_words(col.size()), stream);
+ mask_ptrs.emplace_back(pd_masks.back().data());
+
+ thrust::transform(rmm::exec_policy(stream),
+ null_mask,
+ null_mask + pd_masks.back().size(),
+ parent_pd_mask,
+ pd_masks.back().data(),
+ thrust::bit_and());
+ }
+ }
+ if (col.orc_kind() == LIST) {
+ // Need a new pushdown mask unless both the parent and current column are not nullable
+ auto const child_col = orc_table.column(col.child_begin()[0]);
+ // pushdown mask applies to child column; use the child column size
+ pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream);
+ mask_ptrs.emplace_back(pd_masks.back().data());
+ pushdown_lists_null_mask(col, orc_table.d_columns, parent_pd_mask, pd_masks.back(), stream);
+ }
+ }
+
+ // Attach null masks to device column views (async)
+ auto const d_mask_ptrs = cudf::detail::make_device_uvector_async(mask_ptrs, stream);
+ thrust::for_each_n(
+ rmm::exec_policy(stream),
+ thrust::make_counting_iterator(0ul),
+ orc_table.num_columns(),
+ [cols = device_span{orc_table.d_columns},
+ ptrs = device_span{d_mask_ptrs}] __device__(auto& idx) {
+ cols[idx].pushdown_mask = ptrs[idx];
+ });
+
+ return {std::move(pd_masks), std::move(mask_ptrs)};
+}
+
 template 
struct device_stack {
 __device__ device_stack(T* stack_storage, int capacity)
@@ -1166,28 +1446,35 @@ struct device_stack {
 
 orc_table_view make_orc_table_view(table_view const& table,
table_device_view const& d_table, - table_metadata const* user_metadata, + table_input_metadata const& table_meta, rmm::cuda_stream_view stream) { std::vector orc_columns; std::vector str_col_indexes; - std::function append_orc_column = [&](column_view const& col, - int index_in_table) { - int const str_idx = - (col.type().id() == type_id::STRING) ? static_cast(str_col_indexes.size()) : -1; - auto const& new_col = - orc_columns.emplace_back(orc_columns.size(), str_idx, index_in_table, col, user_metadata); - if (new_col.is_string()) { str_col_indexes.push_back(new_col.index()); } - if (col.type().id() == type_id::LIST) - append_orc_column(col.child(lists_column_view::child_column_index), -1); - if (col.type().id() == type_id::STRUCT) - for (auto child = col.child_begin(); child != col.child_end(); ++child) - append_orc_column(*child, -1); - }; + std::function + append_orc_column = + [&](column_view const& col, orc_column_view* parent_col, column_in_metadata const& col_meta) { + int const str_idx = + (col.type().id() == type_id::STRING) ? 
static_cast(str_col_indexes.size()) : -1; + + auto const new_col_idx = orc_columns.size(); + orc_columns.emplace_back(new_col_idx, str_idx, parent_col, col, col_meta); + if (orc_columns[new_col_idx].is_string()) { str_col_indexes.push_back(new_col_idx); } + + if (col.type().id() == type_id::LIST) { + append_orc_column(col.child(lists_column_view::child_column_index), + &orc_columns[new_col_idx], + col_meta.child(lists_column_view::child_column_index)); + } else if (col.type().id() == type_id::STRUCT) { + for (auto child_idx = 0; child_idx != col.num_children(); ++child_idx) + append_orc_column( + col.child(child_idx), &orc_columns[new_col_idx], col_meta.child(child_idx)); + } + }; for (auto col_idx = 0; col_idx < table.num_columns(); ++col_idx) { - append_orc_column(table.column(col_idx), col_idx); + append_orc_column(table.column(col_idx), nullptr, table_meta.column_metadata[col_idx]); } rmm::device_uvector d_orc_columns(orc_columns.size(), stream); @@ -1256,19 +1543,24 @@ hostdevice_2dvector calculate_rowgroup_bounds(orc_table_view cons // Root column if (!col.parent_index.has_value()) { size_type const rows_begin = rg_idx * rowgroup_size; - auto const rows_end = - thrust::min((rg_idx + 1) * rowgroup_size, col.cudf_column.size()); + auto const rows_end = thrust::min((rg_idx + 1) * rowgroup_size, col.size()); return rowgroup_rows{rows_begin, rows_end}; } else { // Child column - auto const parent_index = *col.parent_index; - column_device_view parent_col = cols[parent_index].cudf_column; - if (parent_col.type().id() != type_id::LIST) return rg_bounds[rg_idx][parent_index]; - - auto parent_offsets = parent_col.child(lists_column_view::offsets_column_index); - auto const& parent_rowgroup_rows = rg_bounds[rg_idx][parent_index]; - auto const rows_begin = parent_offsets.element(parent_rowgroup_rows.begin); - auto const rows_end = parent_offsets.element(parent_rowgroup_rows.end); + auto const parent_index = *col.parent_index; + orc_column_device_view parent_col = 
cols[parent_index]; + auto const parent_rg = rg_bounds[rg_idx][parent_index]; + if (parent_col.type().id() != type_id::LIST) { + auto const offset_diff = parent_col.offset() - col.offset(); + return rowgroup_rows{parent_rg.begin + offset_diff, parent_rg.end + offset_diff}; + } + + auto offsets = parent_col.child(lists_column_view::offsets_column_index); + auto const rows_begin = + offsets.element(parent_rg.begin + parent_col.offset()) - col.offset(); + auto const rows_end = + offsets.element(parent_rg.end + parent_col.offset()) - col.offset(); + return rowgroup_rows{rows_begin, rows_end}; } }); @@ -1295,8 +1587,14 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, current_sizes.end(), [d_cols = device_span{orc_table.d_columns}, col_idx = orc_col.index()] __device__(auto idx) { - auto const& col = d_cols[col_idx].cudf_column; - if (col.is_null(idx)) return 0u; + auto const& col = d_cols[col_idx]; + auto const pushdown_mask = [&]() -> cudf::bitmask_type const* { + auto const parent_index = d_cols[col_idx].parent_index; + if (!parent_index.has_value()) return nullptr; + return d_cols[parent_index.value()].pushdown_mask; + }(); + if (col.is_null(idx) or not bit_value_or(pushdown_mask, idx, true)) + return 0u; int64_t const element = (col.type().id() == type_id::DECIMAL32) ? col.element(idx) : col.element(idx); @@ -1418,9 +1716,25 @@ void writer::impl::write(table_view const& table) CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); auto const num_rows = table.num_rows(); + if (not table_meta) { table_meta = std::make_unique(table); } + + // Fill unnamed columns' names in table_meta + std::function add_default_name = + [&](column_in_metadata& col_meta, std::string default_name) { + if (col_meta.get_name().empty()) col_meta.set_name(default_name); + for (size_type i = 0; i < col_meta.num_children(); ++i) { + add_default_name(col_meta.child(i), col_meta.get_name() + "." 
+ std::to_string(i)); + } + }; + for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) { + add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i)); + } + auto const d_table = table_device_view::create(table, stream); - auto orc_table = make_orc_table_view(table, *d_table, user_metadata, stream); + auto orc_table = make_orc_table_view(table, *d_table, *table_meta, stream); + + auto const pd_masks = init_pushdown_null_masks(orc_table, stream); auto rowgroup_bounds = calculate_rowgroup_bounds(orc_table, row_index_stride_, stream); @@ -1458,7 +1772,7 @@ void writer::impl::write(table_view const& table) auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes)); auto enc_data = encode_columns( - orc_table, std::move(dictionaries), std::move(dec_chunk_sizes), segmentation, streams); + orc_table, std::move(dictionaries), std::move(dec_chunk_sizes), segmentation, streams, stream); // Assemble individual disparate column chunks into contiguous data streams size_type const num_index_streams = (orc_table.num_columns() + 1); @@ -1646,6 +1960,18 @@ void writer::impl::write(table_view const& table) } // In preorder traversal the column after a list column is always the child column if (column.orc_kind() == LIST) { schema_type.subtypes.emplace_back(column.id() + 1); } + if (column.orc_kind() == STRUCT) { + std::transform(column.child_begin(), + column.child_end(), + std::back_inserter(schema_type.subtypes), + [&](auto const& child_idx) { return orc_table.column(child_idx).id(); }); + std::transform(column.child_begin(), + column.child_end(), + std::back_inserter(schema_type.fieldNames), + [&](auto const& child_idx) { + return std::string{orc_table.column(child_idx).orc_name()}; + }); + } } } else { // verify the user isn't passing mismatched tables @@ -1671,11 +1997,13 @@ void writer::impl::close() PostScript ps; ff.contentLength = out_sink_->bytes_written(); - if (user_metadata) { - for (auto it = 
user_metadata->user_data.begin(); it != user_metadata->user_data.end(); it++) { - ff.metadata.push_back({it->first, it->second}); - } - } + std::transform(table_meta->user_data.begin(), + table_meta->user_data.end(), + std::back_inserter(ff.metadata), + [&](auto const& udata) { + return UserMetadataItem{udata.first, udata.second}; + }); + // Write statistics metadata if (md.stripeStats.size() != 0) { buffer_.resize((compression_kind_ != NONE) ? 3 : 0); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 787bdeb3a4e..a8fe22a360f 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -262,23 +262,6 @@ class writer::impl { file_segmentation const& segmentation, std::map const& decimal_column_sizes); - /** - * @brief Encodes the input columns into streams. - * - * @param orc_table Non-owning view of a cuDF table w/ ORC-related info - * @param dict_data Dictionary data memory - * @param dict_index Dictionary index memory - * @param dec_chunk_sizes Information about size of encoded decimal columns - * @param segmentation stripe and rowgroup ranges - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Encoded data and per-chunk stream descriptors - */ - encoded_data encode_columns(orc_table_view const& orc_table, - string_dictionaries&& dictionaries, - encoder_decimal_info&& dec_chunk_sizes, - file_segmentation const& segmentation, - orc_streams const& streams); - /** * @brief Returns stripe information after compacting columns' individual data * chunks into contiguous data streams. @@ -375,14 +358,11 @@ class writer::impl { cudf::io::orc::Metadata md; // current write position for rowgroups/chunks size_t current_chunk_offset; - // optional user metadata - table_metadata const* user_metadata = nullptr; - // only used in the write_chunked() case. 
copied from the (optionally) user supplied - // argument to write_chunked_begin() - table_metadata_with_nullability user_metadata_with_nullability; // special parameter only used by detail::write() to indicate that we are guaranteeing // a single table write. this enables some internal optimizations. bool const single_write_mode; + // optional user metadata + std::unique_ptr table_meta; // to track if the output has been written to sink bool closed = false; diff --git a/cpp/tests/io/metadata_utilities.cpp b/cpp/tests/io/metadata_utilities.cpp new file mode 100644 index 00000000000..39617c99690 --- /dev/null +++ b/cpp/tests/io/metadata_utilities.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +namespace cudf::test { + +void expect_metadata_equal(cudf::io::table_input_metadata in_meta, + cudf::io::table_metadata out_meta) +{ + std::function compare_names = + [&](cudf::io::column_name_info out_col, cudf::io::column_in_metadata in_col) { + if (not in_col.get_name().empty()) { EXPECT_EQ(out_col.name, in_col.get_name()); } + ASSERT_EQ(out_col.children.size(), in_col.num_children()); + for (size_t i = 0; i < out_col.children.size(); ++i) { + compare_names(out_col.children[i], in_col.child(i)); + } + }; + + ASSERT_EQ(out_meta.schema_info.size(), in_meta.column_metadata.size()); + + for (size_t i = 0; i < out_meta.schema_info.size(); ++i) { + compare_names(out_meta.schema_info[i], in_meta.column_metadata[i]); + } +} + +} // namespace cudf::test diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index fbeba925f1b..cdf0a3b275b 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -161,14 +162,10 @@ struct SkipRowTest { auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); column_wrapper input_col( sequence, sequence + file_num_rows, validity); - - std::vector> input_cols; - input_cols.push_back(input_col.release()); - auto input_table = std::make_unique(std::move(input_cols)); - EXPECT_EQ(1, input_table->num_columns()); + table_view input_table({input_col}); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, input_table->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, input_table); cudf_io::write_orc(out_opts); auto begin_sequence = sequence, end_sequence = sequence; @@ -180,9 +177,7 @@ struct SkipRowTest { begin_sequence, end_sequence, validity); std::vector> output_cols; output_cols.push_back(output_col.release()); - auto expected = std::make_unique
(std::move(output_cols)); - EXPECT_EQ(1, expected->num_columns()); - return expected; + return std::make_unique
(std::move(output_cols)); } void test(int skip_rows, int file_num_rows, int read_num_rows) @@ -224,22 +219,18 @@ TYPED_TEST(OrcWriterNumericTypeTest, SingleColumn) constexpr auto num_rows = 100; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcSingleColumn.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterNumericTypeTest, SingleColumnWithNulls) @@ -250,22 +241,18 @@ TYPED_TEST(OrcWriterNumericTypeTest, SingleColumnWithNulls) constexpr auto num_rows = 100; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcSingleColumnWithNulls.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) @@ -277,15 +264,11 @@ TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) constexpr auto num_rows = 100; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcTimestamps.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = @@ -294,7 +277,7 @@ TYPED_TEST(OrcWriterTimestampTypeTest, Timestamps) .timestamp_type(this->type()); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) @@ -307,15 +290,11 @@ TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) constexpr auto num_rows = 100; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcTimestampsWithNulls.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = @@ -324,12 +303,12 @@ TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) .timestamp_type(this->type()); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TEST_F(OrcWriterTest, MultiColumn) { - constexpr auto num_rows = 100; + constexpr auto num_rows = 10; auto col0_data = random_values(num_rows); auto col1_data = random_values(num_rows); @@ -351,29 +330,29 @@ TEST_F(OrcWriterTest, MultiColumn) column_wrapper col5{col5_data.begin(), col5_data.end(), validity}; column_wrapper col6{col6_data, col6_data + num_rows, validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("bools"); - expected_metadata.column_names.emplace_back("int8s"); - expected_metadata.column_names.emplace_back("int16s"); - expected_metadata.column_names.emplace_back("int32s"); - expected_metadata.column_names.emplace_back("floats"); - expected_metadata.column_names.emplace_back("doubles"); - expected_metadata.column_names.emplace_back("decimal"); - - std::vector> cols; - cols.push_back(col0.release()); - cols.push_back(col1.release()); - cols.push_back(col2.release()); - cols.push_back(col3.release()); - cols.push_back(col4.release()); - cols.push_back(col5.release()); - cols.push_back(col6.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(7, expected->num_columns()); + cudf::test::lists_column_wrapper col7{ + {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}; + + auto child_col = + cudf::test::fixed_width_column_wrapper{48, 27, 25, 31, 351, 351, 29, 15, -1, -99}; + auto col8 = cudf::test::structs_column_wrapper{child_col}; + + table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("bools"); + expected_metadata.column_metadata[1].set_name("int8s"); + expected_metadata.column_metadata[2].set_name("int16s"); + expected_metadata.column_metadata[3].set_name("int32s"); + expected_metadata.column_metadata[4].set_name("floats"); + expected_metadata.column_metadata[5].set_name("doubles"); + expected_metadata.column_metadata[6].set_name("decimal"); + expected_metadata.column_metadata[7].set_name("lists"); + expected_metadata.column_metadata[8].set_name("structs"); auto filepath = temp_env->get_temp_filepath("OrcMultiColumn.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected) .metadata(&expected_metadata); cudf_io::write_orc(out_opts); @@ -381,13 +360,13 @@ TEST_F(OrcWriterTest, MultiColumn) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, MultiColumnWithNulls) { - constexpr auto num_rows = 100; + constexpr auto num_rows = 10; auto col0_data = random_values(num_rows); auto 
col1_data = random_values(num_rows); @@ -402,14 +381,14 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) auto col0_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); }); auto col1_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 10); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 2); }); auto col2_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); auto col3_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i == (num_rows - 1)); }); auto col4_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 40 && i <= 60); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i >= 4 && i <= 6); }); auto col5_mask = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 80); }); + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i > 8); }); auto col6_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 3); }); @@ -420,30 +399,28 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) column_wrapper col4{col4_data.begin(), col4_data.end(), col4_mask}; column_wrapper col5{col5_data.begin(), col5_data.end(), col5_mask}; column_wrapper col6{col6_data, col6_data + num_rows, col6_mask}; - - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("bools"); - expected_metadata.column_names.emplace_back("int8s"); - expected_metadata.column_names.emplace_back("int16s"); - expected_metadata.column_names.emplace_back("int32s"); - expected_metadata.column_names.emplace_back("floats"); - expected_metadata.column_names.emplace_back("doubles"); - expected_metadata.column_names.emplace_back("decimal"); - - std::vector> cols; - cols.push_back(col0.release()); - cols.push_back(col1.release()); - cols.push_back(col2.release()); - cols.push_back(col3.release()); - 
cols.push_back(col4.release()); - cols.push_back(col5.release()); - cols.push_back(col6.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(7, expected->num_columns()); + cudf::test::lists_column_wrapper col7{ + {{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}}, + col0_mask}; + auto ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351, 29, 15, -1, -99}, {1, 0, 1, 1, 0, 1, 1, 1, 0, 1}}; + auto col8 = cudf::test::structs_column_wrapper{{ages_col}, {0, 1, 1, 0, 1, 1, 0, 1, 1, 0}}; + table_view expected({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("bools"); + expected_metadata.column_metadata[1].set_name("int8s"); + expected_metadata.column_metadata[2].set_name("int16s"); + expected_metadata.column_metadata[3].set_name("int32s"); + expected_metadata.column_metadata[4].set_name("floats"); + expected_metadata.column_metadata[5].set_name("doubles"); + expected_metadata.column_metadata[6].set_name("decimal"); + expected_metadata.column_metadata[7].set_name("lists"); + expected_metadata.column_metadata[8].set_name("structs"); auto filepath = temp_env->get_temp_filepath("OrcMultiColumnWithNulls.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected) .metadata(&expected_metadata); cudf_io::write_orc(out_opts); @@ -451,8 +428,8 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, ReadZeroRows) @@ -463,15 +440,11 @@ 
TEST_F(OrcWriterTest, ReadZeroRows) constexpr auto num_rows = 10; column_wrapper col( sequence, sequence + num_rows, validity); - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({col}); auto filepath = temp_env->get_temp_filepath("OrcSingleColumn.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); cudf_io::orc_reader_options in_opts = @@ -498,21 +471,16 @@ TEST_F(OrcWriterTest, Strings) column_wrapper col1{strings.begin(), strings.end()}; column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_other"); - expected_metadata.column_names.emplace_back("col_string"); - expected_metadata.column_names.emplace_back("col_another"); + table_view expected({col0, col1, col2}); - std::vector> cols; - cols.push_back(col0.release()); - cols.push_back(col1.release()); - cols.push_back(col2.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(3, expected->num_columns()); + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected) .metadata(&expected_metadata); cudf_io::write_orc(out_opts); @@ -520,8 +488,8 @@ TEST_F(OrcWriterTest, Strings) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, SlicedTable) @@ -545,21 +513,24 @@ TEST_F(OrcWriterTest, SlicedTable) column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; column_wrapper col3{seq_col3, seq_col3 + num_rows, validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_other"); - expected_metadata.column_names.emplace_back("col_string"); - expected_metadata.column_names.emplace_back("col_another"); - expected_metadata.column_names.emplace_back("col_decimal"); + using lcw = cudf::test::lists_column_wrapper; + lcw col4{{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}}; - std::vector> cols; - cols.push_back(col0.release()); - cols.push_back(col1.release()); - cols.push_back(col2.release()); - cols.push_back(col3.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(4, expected->num_columns()); + auto ages_col = cudf::test::fixed_width_column_wrapper{ + {48, 27, 25, 31, 351, 351, 29, 15}, {1, 1, 1, 1, 1, 0, 1, 1}}; + auto col5 = cudf::test::structs_column_wrapper{{ages_col}, {1, 1, 1, 1, 0, 1, 1, 1}}; - auto expected_slice = cudf::slice(expected->view(), {2, static_cast(num_rows)}); + table_view expected({col0, col1, col2, col3, col4, col5}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); + expected_metadata.column_metadata[3].set_name("col_decimal"); + expected_metadata.column_metadata[4].set_name("lists"); + expected_metadata.column_metadata[5].set_name("structs"); + + auto expected_slice = cudf::slice(expected, {2, static_cast(num_rows)}); auto filepath = temp_env->get_temp_filepath("SlicedTable.orc"); cudf_io::orc_writer_options out_opts = @@ -572,7 +543,7 @@ TEST_F(OrcWriterTest, SlicedTable) auto result = cudf_io::read_orc(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, HostBuffer) @@ -583,17 +554,14 @@ TEST_F(OrcWriterTest, HostBuffer) cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); column_wrapper col{seq_col.begin(), seq_col.end(), validity}; - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_other"); + table_view expected{{col}}; - std::vector> cols; - cols.push_back(col.release()); - const auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); std::vector out_buffer; cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info(&out_buffer), expected->view()) + cudf_io::orc_writer_options::builder(cudf_io::sink_info(&out_buffer), expected) .metadata(&expected_metadata); cudf_io::write_orc(out_opts); @@ -602,8 +570,8 @@ TEST_F(OrcWriterTest, HostBuffer) .use_index(false); const auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcWriterTest, negTimestampsNano) @@ -618,15 +586,11 @@ TEST_F(OrcWriterTest, negTimestampsNano) -1530705634500000000, -1674638741932929000, }; - - std::vector> cols; - cols.push_back(timestamps_ns.release()); - auto expected = std::make_unique
(std::move(cols)); - EXPECT_EQ(1, expected->num_columns()); + table_view expected({timestamps_ns}); auto filepath = temp_env->get_temp_filepath("OrcNegTimestamp.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); @@ -634,10 +598,9 @@ TEST_F(OrcWriterTest, negTimestampsNano) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view().column(0), - result.tbl->view().column(0), - cudf::test::debug_output_level::ALL_ERRORS); - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + expected.column(0), result.tbl->view().column(0), cudf::test::debug_output_level::ALL_ERRORS); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } TEST_F(OrcWriterTest, Slice) @@ -747,21 +710,51 @@ TEST_F(OrcChunkedWriterTest, ManyTables) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected); } -TEST_F(OrcChunkedWriterTest, Strings) +TEST_F(OrcChunkedWriterTest, Metadata) { - std::vector> cols; + std::vector strings{ + "Monday", "Tuesday", "THURSDAY", "Wednesday", "Friday", "Sunday", "Saturday"}; + const auto num_rows = strings.size(); + + auto seq_col0 = random_values(num_rows); + auto seq_col2 = random_values(num_rows); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + column_wrapper col0{seq_col0.begin(), seq_col0.end(), validity}; + column_wrapper col1{strings.begin(), strings.end()}; + column_wrapper col2{seq_col2.begin(), seq_col2.end(), validity}; + table_view expected({col0, col1, col2}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("col_other"); + 
expected_metadata.column_metadata[1].set_name("col_string"); + expected_metadata.column_metadata[2].set_name("col_another"); + + auto filepath = temp_env->get_temp_filepath("ChunkedMetadata.orc"); + cudf_io::chunked_orc_writer_options opts = + cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info{filepath}) + .metadata(&expected_metadata); + cudf_io::orc_chunked_writer(opts).write(expected).write(expected); + + cudf_io::orc_reader_options read_opts = + cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}); + auto result = cudf_io::read_orc(read_opts); + + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(OrcChunkedWriterTest, Strings) +{ bool mask1[] = {1, 1, 0, 1, 1, 1, 1}; std::vector h_strings1{"four", "score", "and", "seven", "years", "ago", "abcdefgh"}; cudf::test::strings_column_wrapper strings1(h_strings1.begin(), h_strings1.end(), mask1); - cols.push_back(strings1.release()); - cudf::table tbl1(std::move(cols)); + table_view tbl1({strings1}); bool mask2[] = {0, 1, 1, 1, 1, 1, 1}; std::vector h_strings2{"ooooo", "ppppppp", "fff", "j", "cccc", "bbb", "zzzzzzzzzzz"}; cudf::test::strings_column_wrapper strings2(h_strings2.begin(), h_strings2.end(), mask2); - cols.push_back(strings2.release()); - cudf::table tbl2(std::move(cols)); + table_view tbl2({strings2}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -864,7 +857,6 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) using T = TypeParam; int num_els = 31; - std::vector> cols; bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; @@ -875,9 +867,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) std::fill(c1b, c1b + num_els, static_cast(6)); column_wrapper c1a_w(c1a, c1a + num_els, mask); column_wrapper c1b_w(c1b, c1b + num_els, mask); - cols.push_back(c1a_w.release()); - cols.push_back(c1b_w.release()); - cudf::table tbl1(std::move(cols)); + table_view 
tbl1({c1a_w, c1b_w}); T c2a[num_els]; std::fill(c2a, c2a + num_els, static_cast(8)); @@ -885,9 +875,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize) std::fill(c2b, c2b + num_els, static_cast(9)); column_wrapper c2a_w(c2a, c2a + num_els, mask); column_wrapper c2b_w(c2b, c2b + num_els, mask); - cols.push_back(c2a_w.release()); - cols.push_back(c2b_w.release()); - cudf::table tbl2(std::move(cols)); + table_view tbl2({c2a_w, c2b_w}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -911,7 +899,6 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) using T = TypeParam; int num_els = 33; - std::vector> cols; bool mask[] = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; @@ -922,9 +909,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) std::fill(c1b, c1b + num_els, static_cast(6)); column_wrapper c1a_w(c1a, c1a + num_els, mask); column_wrapper c1b_w(c1b, c1b + num_els, mask); - cols.push_back(c1a_w.release()); - cols.push_back(c1b_w.release()); - cudf::table tbl1(std::move(cols)); + table_view tbl1({c1a_w, c1b_w}); T c2a[num_els]; std::fill(c2a, c2a + num_els, static_cast(8)); @@ -932,9 +917,7 @@ TYPED_TEST(OrcChunkedWriterNumericTypeTest, UnalignedSize2) std::fill(c2b, c2b + num_els, static_cast(9)); column_wrapper c2a_w(c2a, c2a + num_els, mask); column_wrapper c2b_w(c2b, c2b + num_els, mask); - cols.push_back(c2a_w.release()); - cols.push_back(c2b_w.release()); - cudf::table tbl2(std::move(cols)); + table_view tbl2({c2a_w, c2b_w}); auto expected = cudf::concatenate(std::vector({tbl1, tbl2})); @@ -981,18 +964,12 @@ TEST_F(OrcStatisticsTest, Basic) sequence, sequence + num_rows, valid_all); column_wrapper col5( sequence, sequence + num_rows, validity); - std::vector> cols; - cols.push_back(col1.release()); - cols.push_back(col2.release()); - cols.push_back(col3.release()); - cols.push_back(col4.release()); - cols.push_back(col5.release()); - auto expected = 
std::make_unique
(std::move(cols)); + table_view expected({col1, col2, col3, col4, col5}); auto filepath = temp_env->get_temp_filepath("OrcStatsMerge.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); auto const stats = cudf_io::read_parsed_orc_statistics(cudf_io::source_info{filepath}); @@ -1056,17 +1033,14 @@ TEST_F(OrcWriterTest, SlicedValidMask) column_wrapper col{strings.begin(), strings.end(), validity}; - std::vector> cols; - cols.push_back(col.release()); - - cudf_io::table_metadata expected_metadata; - expected_metadata.column_names.emplace_back("col_string"); - // Bug tested here is easiest to reproduce when column_offset % 32 is 31 std::vector indices{31, 34}; - std::vector sliced_col = cudf::slice(cols[0]->view(), indices); + auto sliced_col = cudf::slice(static_cast(col), indices); cudf::table_view tbl{sliced_col}; + cudf_io::table_input_metadata expected_metadata(tbl); + expected_metadata.column_metadata[0].set_name("col_string"); + auto filepath = temp_env->get_temp_filepath("OrcStrings.orc"); cudf_io::orc_writer_options out_opts = cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl) @@ -1078,7 +1052,7 @@ TEST_F(OrcWriterTest, SlicedValidMask) auto result = cudf_io::read_orc(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(tbl, result.tbl->view()); - EXPECT_EQ(expected_metadata.column_names, result.metadata.column_names); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(OrcReaderTest, SingleInputs) @@ -1087,9 +1061,9 @@ TEST_F(OrcReaderTest, SingleInputs) auto table1 = create_random_fixed_table(5, 5, true); auto filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); - cudf_io::chunked_orc_writer_options opts1 = - cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info{filepath1}); - 
cudf_io::orc_chunked_writer(opts1).write(*table1); + cudf_io::orc_writer_options write_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath1}, table1->view()); + cudf_io::write_orc(write_opts); cudf_io::orc_reader_options read_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{{filepath1}}); @@ -1106,15 +1080,19 @@ TEST_F(OrcReaderTest, MultipleInputs) auto full_table = cudf::concatenate(std::vector({*table1, *table2})); - auto filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); - cudf_io::chunked_orc_writer_options opts1 = - cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info{filepath1}); - cudf_io::orc_chunked_writer(opts1).write(*table1); + auto const filepath1 = temp_env->get_temp_filepath("SimpleTable1.orc"); + { + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath1}, table1->view()); + cudf_io::write_orc(out_opts); + } - auto filepath2 = temp_env->get_temp_filepath("SimpleTable2.orc"); - cudf_io::chunked_orc_writer_options opts2 = - cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info{filepath2}); - cudf_io::orc_chunked_writer(opts2).write(*table2); + auto const filepath2 = temp_env->get_temp_filepath("SimpleTable2.orc"); + { + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath2}, table2->view()); + cudf_io::write_orc(out_opts); + } cudf_io::orc_reader_options read_opts = cudf_io::orc_reader_options::builder(cudf_io::source_info{{filepath1, filepath2}}); @@ -1139,14 +1117,11 @@ TEST_P(OrcWriterTestDecimal, Decimal64) }); auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 7 == 0; }); column_wrapper col{data, data + num_rows, mask}; - - std::vector> cols; - cols.push_back(col.release()); - auto tbl = std::make_unique
(std::move(cols)); + cudf::table_view tbl({static_cast(col)}); auto filepath = temp_env->get_temp_filepath("Decimal64.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, tbl); cudf_io::write_orc(out_opts); @@ -1154,7 +1129,7 @@ TEST_P(OrcWriterTestDecimal, Decimal64) cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}); auto result = cudf_io::read_orc(in_opts); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(tbl->view().column(0), result.tbl->view().column(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(tbl.column(0), result.tbl->view().column(0)); } INSTANTIATE_TEST_CASE_P(OrcWriterTest, @@ -1173,14 +1148,11 @@ TEST_F(OrcWriterTest, Decimal32) }); auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13 == 0; }); column_wrapper col{data, data + num_rows, mask}; - - std::vector> cols; - cols.push_back(col.release()); - auto expected = std::make_unique
(std::move(cols)); + cudf::table_view expected({static_cast(col)}); auto filepath = temp_env->get_temp_filepath("Decimal32.orc"); cudf_io::orc_writer_options out_opts = - cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); cudf_io::write_orc(out_opts); diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 7260aa9e686..0f59b0d5e15 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -184,25 +185,6 @@ std::unique_ptr make_parquet_list_col( offsets_size, offsets.release(), std::move(child), 0, rmm::device_buffer{}); } -void compare_metadata_equality(cudf::io::table_input_metadata in_meta, - cudf::io::table_metadata out_meta) -{ - std::function compare_names = - [&](cudf::io::column_name_info out_col, cudf::io::column_in_metadata in_col) { - if (not in_col.get_name().empty()) { EXPECT_EQ(out_col.name, in_col.get_name()); } - EXPECT_EQ(out_col.children.size(), in_col.num_children()); - for (size_t i = 0; i < out_col.children.size(); ++i) { - compare_names(out_col.children[i], in_col.child(i)); - } - }; - - EXPECT_EQ(out_meta.schema_info.size(), in_meta.column_metadata.size()); - - for (size_t i = 0; i < out_meta.schema_info.size(); ++i) { - compare_names(out_meta.schema_info[i], in_meta.column_metadata[i]); - } -} - // Base test fixture for tests struct ParquetWriterTest : public cudf::test::BaseFixture { }; @@ -444,7 +426,7 @@ TEST_F(ParquetWriterTest, MultiColumn) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, MultiColumnWithNulls) @@ -528,7 +510,7 @@ TEST_F(ParquetWriterTest, MultiColumnWithNulls) 
// TODO: Need to be able to return metadata in tree form from reader so they can be compared. // Unfortunately the closest thing to a hierarchical schema is column_name_info which does not // have any tests for it c++ or python. - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, Strings) @@ -568,7 +550,7 @@ TEST_F(ParquetWriterTest, Strings) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, SlicedTable) @@ -682,7 +664,7 @@ TEST_F(ParquetWriterTest, SlicedTable) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected_slice, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, ListColumn) @@ -780,7 +762,7 @@ TEST_F(ParquetWriterTest, ListColumn) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, MultiIndex) @@ -831,7 +813,7 @@ TEST_F(ParquetWriterTest, MultiIndex) auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, HostBuffer) @@ -860,7 +842,7 @@ TEST_F(ParquetWriterTest, HostBuffer) const auto result = cudf_io::read_parquet(in_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - 
compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, NonNullable) @@ -989,7 +971,7 @@ TEST_F(ParquetWriterTest, StructOfList) const auto result = cudf_io::read_parquet(read_args); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetWriterTest, ListOfStruct) @@ -1044,7 +1026,7 @@ TEST_F(ParquetWriterTest, ListOfStruct) const auto result = cudf_io::read_parquet(read_args); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } // custom data sink that supports device writes. uses plain file io. @@ -1433,7 +1415,7 @@ TEST_F(ParquetChunkedWriterTest, ListOfStruct) auto result = cudf_io::read_parquet(read_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) @@ -1526,7 +1508,7 @@ TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList) auto result = cudf_io::read_parquet(read_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); // We specifically mentioned in input schema that struct_2 is non-nullable across chunked calls. 
auto result_parent_list = result.tbl->get_column(0); @@ -1697,7 +1679,7 @@ TEST_F(ParquetChunkedWriterTest, DifferentNullabilityStruct) auto result = cudf_io::read_parquet(read_opts); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.tbl, *full_table); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetChunkedWriterTest, ForcedNullability) @@ -1830,7 +1812,7 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityStruct) auto result = cudf_io::read_parquet(read_opts); CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *full_table); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } TEST_F(ParquetChunkedWriterTest, ReadRowGroups) @@ -2552,7 +2534,7 @@ TEST_F(ParquetReaderTest, SelectNestedColumn) expected_metadata.column_metadata[0].child(0).child(0).set_name("age"); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } { // Test selecting a non-leaf and expecting all hierarchy from that node onwards @@ -2581,7 +2563,7 @@ TEST_F(ParquetReaderTest, SelectNestedColumn) expected_metadata.column_metadata[0].child(0).child(1).set_name("age"); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } { // Test selecting struct children out of order @@ -2616,7 +2598,7 @@ TEST_F(ParquetReaderTest, SelectNestedColumn) expected_metadata.column_metadata[0].child(1).set_name("human?"); CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); - compare_metadata_equality(expected_metadata, result.metadata); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); } } diff --git 
a/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java index 238e0b61fd9..85443c3ae0f 100644 --- a/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,8 +20,6 @@ public class ORCWriterOptions extends CompressedMetadataWriterOptions { - public static ORCWriterOptions DEFAULT = new ORCWriterOptions(new Builder()); - private ORCWriterOptions(Builder builder) { super(builder); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 2744728fb44..0af02d1c926 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1147,7 +1147,11 @@ public static TableWriter writeORCChunked(ORCWriterOptions options, HostBufferCo */ @Deprecated public void writeORC(File outputFile) { - writeORC(ORCWriterOptions.DEFAULT, outputFile); + // Need to specify the number of columns but leave all column names undefined + String[] names = new String[getNumberOfColumns()]; + Arrays.fill(names, ""); + ORCWriterOptions opts = ORCWriterOptions.builder().withColumnNames(names).build(); + writeORC(opts, outputFile); } /** @@ -1157,6 +1161,7 @@ public void writeORC(File outputFile) { */ @Deprecated public void writeORC(ORCWriterOptions options, File outputFile) { + assert options.getColumnNames().length == getNumberOfColumns() : "must specify names for all columns"; try (TableWriter writer = Table.writeORCChunked(options, outputFile)) { writer.write(this); } diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 96dd02e5f2a..ee75112a2ed 100644 --- 
a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -736,6 +736,29 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam } } +cudf::io::table_input_metadata createORCTableInputMetadata(JNIEnv *env, + jobjectArray const &j_col_names, + jbooleanArray const &j_col_nullability, + jobjectArray const &j_metadata_keys, + jobjectArray const &j_metadata_values) { + cudf::jni::native_jstringArray const col_names(env, j_col_names); + cudf::jni::native_jbooleanArray const col_nullability(env, j_col_nullability); + cudf::jni::native_jstringArray const meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray const meta_values(env, j_metadata_values); + + std::vector const cpp_names = col_names.as_cpp_vector(); + std::size_t const num_columns = cpp_names.size(); + cudf::io::table_input_metadata metadata; + metadata.column_metadata.resize(cpp_names.size()); + for (std::size_t i = 0; i < num_columns; i++) { + metadata.column_metadata[i].set_name(cpp_names[i]).set_nullability(col_nullability[i]); + } + for (int i = 0; i < meta_keys.size(); ++i) { + metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); + } + return metadata; +} + // Check that window parameters are valid. 
bool valid_window_parameters(native_jintArray const &values, native_jpointerArray const &ops, @@ -1500,19 +1523,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( try { cudf::jni::auto_set_device(env); using namespace cudf::io; - cudf::jni::native_jstringArray col_names(env, j_col_names); - cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); - - auto d = col_nullability.data(); - std::vector nullability(d, d + col_nullability.size()); - table_metadata_with_nullability metadata; - metadata.column_nullable = nullability; - metadata.column_names = col_names.as_cpp_vector(); - for (int i = 0; i < meta_keys.size(); ++i) { - metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); - } + table_input_metadata metadata = cudf::jni::createORCTableInputMetadata( + env, j_col_names, j_col_nullability, j_metadata_keys, j_metadata_values); std::unique_ptr data_sink( new cudf::jni::jni_writer_data_sink(env, consumer)); @@ -1542,20 +1554,10 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( try { cudf::jni::auto_set_device(env); using namespace cudf::io; - cudf::jni::native_jstringArray col_names(env, j_col_names); - cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); cudf::jni::native_jstring output_path(env, j_output_path); - auto d = col_nullability.data(); - std::vector nullability(d, d + col_nullability.size()); - table_metadata_with_nullability metadata; - metadata.column_nullable = nullability; - metadata.column_names = col_names.as_cpp_vector(); - for (int i = 0; i < meta_keys.size(); ++i) { - metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); - } + table_input_metadata metadata = 
cudf::jni::createORCTableInputMetadata( + env, j_col_names, j_col_nullability, j_metadata_keys, j_metadata_values); sink_info sink{output_path.get()}; chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) @@ -1577,7 +1579,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCChunk(JNIEnv *env, jcla JNI_NULL_CHECK(env, j_state, "null state", ); using namespace cudf::io; - cudf::table_view *tview = reinterpret_cast(j_table); + cudf::table_view *tview_orig = reinterpret_cast(j_table); + cudf::table_view tview = cudf::jni::remove_validity_if_needed(tview_orig); cudf::jni::native_orc_writer_handle *state = reinterpret_cast(j_state); @@ -1587,7 +1590,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCChunk(JNIEnv *env, jcla } try { cudf::jni::auto_set_device(env); - state->writer->write(*tview); + state->writer->write(tview); } CATCH_STD(env, ) } diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index b69dce57180..0e7ac15a79e 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -6876,7 +6876,10 @@ void testArrowIPCWriteToBufferChunked() { void testORCWriteToBufferChunked() { try (Table table0 = getExpectedFileTable(); MyBufferConsumer consumer = new MyBufferConsumer()) { - try (TableWriter writer = Table.writeORCChunked(ORCWriterOptions.DEFAULT, consumer)) { + String[] colNames = new String[table0.getNumberOfColumns()]; + Arrays.fill(colNames, ""); + ORCWriterOptions opts = ORCWriterOptions.builder().withColumnNames(colNames).build(); + try (TableWriter writer = Table.writeORCChunked(opts, consumer)) { writer.write(table0); writer.write(table0); writer.write(table0); @@ -6924,7 +6927,13 @@ void testORCWriteToFileWithColNames() throws IOException { void testORCWriteToFileUncompressed() throws IOException { File tempFileUncompressed = File.createTempFile("test-uncompressed", ".orc"); try (Table table0 = 
getExpectedFileTable()) { - table0.writeORC(ORCWriterOptions.builder().withCompressionType(CompressionType.NONE).build(), tempFileUncompressed.getAbsoluteFile()); + String[] colNames = new String[table0.getNumberOfColumns()]; + Arrays.fill(colNames, ""); + ORCWriterOptions opts = ORCWriterOptions.builder() + .withColumnNames(colNames) + .withCompressionType(CompressionType.NONE) + .build(); + table0.writeORC(opts, tempFileUncompressed.getAbsoluteFile()); try (Table table2 = Table.readORC(tempFileUncompressed.getAbsoluteFile())) { assertTablesAreEqual(table0, table2); } diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index d89af43028d..3036b000c5b 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -70,13 +70,13 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.compression_type get_compression() except+ bool enable_statistics() except+ cudf_table_view.table_view get_table() except+ - const cudf_io_types.table_metadata *get_metadata() except+ + const cudf_io_types.table_input_metadata *get_metadata() except+ # setter void set_compression(cudf_io_types.compression_type comp) except+ void enable_statistics(bool val) except+ void set_table(cudf_table_view.table_view tbl) except+ - void set_metadata(cudf_io_types.table_metadata* meta) except+ + void set_metadata(cudf_io_types.table_input_metadata* meta) except+ @staticmethod orc_writer_options_builder builder( @@ -94,7 +94,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_table_view.table_view tbl ) except+ orc_writer_options_builder& metadata( - cudf_io_types.table_metadata *meta + cudf_io_types.table_input_metadata *meta ) except+ orc_writer_options build() except+ @@ -107,7 +107,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.compression_type get_compression() except+ bool enable_statistics() except+ cudf_table_view.table_view get_table() except+ - const cudf_io_types.table_metadata_with_nullability *get_metadata( + const 
cudf_io_types.table_input_metadata *get_metadata( ) except+ # setter @@ -115,7 +115,7 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_statistics(bool val) except+ void set_table(cudf_table_view.table_view tbl) except+ void set_metadata( - cudf_io_types.table_metadata_with_nullability* meta + cudf_io_types.table_input_metadata* meta ) except+ @staticmethod @@ -133,7 +133,7 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_table_view.table_view tbl ) except+ chunked_orc_writer_options_builder& metadata( - cudf_io_types.table_metadata *meta + cudf_io_types.table_input_metadata *meta ) except+ chunked_orc_writer_options build() except+ diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index e2053f8ce4f..81ca7e5836b 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -66,36 +66,17 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cdef cudf_io_types.table_with_metadata read_parquet( parquet_reader_options args) except + - cdef cppclass column_in_metadata: - column_in_metadata& set_name(const string& name) - column_in_metadata& set_nullability(bool nullable) - column_in_metadata& set_list_column_as_map() - column_in_metadata& set_int96_timestamps(bool req) - column_in_metadata& set_decimal_precision(uint8_t precision) - column_in_metadata& child(size_type i) - - cdef cppclass table_input_metadata: - table_input_metadata() except + - table_input_metadata(const cudf_table_view.table_view& table) except + - table_input_metadata( - const cudf_table_view.table_view& table, - map[string, string] user_data - ) except + - - vector[column_in_metadata] column_metadata - map[string, string] user_data - cdef cppclass parquet_writer_options: parquet_writer_options() except + cudf_io_types.sink_info get_sink_info() except + cudf_io_types.compression_type get_compression() except + cudf_io_types.statistics_freq get_stats_level() except + cudf_table_view.table_view 
get_table() except + - const table_input_metadata get_metadata() except + + const cudf_io_types.table_input_metadata get_metadata() except + string get_column_chunks_file_path() except+ void set_metadata( - table_input_metadata *m + cudf_io_types.table_input_metadata *m ) except + void set_stats_level( cudf_io_types.statistics_freq sf @@ -121,7 +102,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_table_view.table_view table_ ) except + parquet_writer_options_builder& metadata( - table_input_metadata *m + cudf_io_types.table_input_metadata *m ) except + parquet_writer_options_builder& stats_level( cudf_io_types.statistics_freq sf @@ -147,11 +128,11 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.sink_info get_sink() except + cudf_io_types.compression_type get_compression() except + cudf_io_types.statistics_freq get_stats_level() except + - table_input_metadata* get_metadata( + cudf_io_types.table_input_metadata* get_metadata( ) except+ void set_metadata( - table_input_metadata *m + cudf_io_types.table_input_metadata *m ) except + void set_stats_level( cudf_io_types.statistics_freq sf @@ -171,7 +152,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.sink_info sink_, ) except + chunked_parquet_writer_options_builder& metadata( - table_input_metadata *m + cudf_io_types.table_input_metadata *m ) except + chunked_parquet_writer_options_builder& stats_level( cudf_io_types.statistics_freq sf diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index 7fa6406bd29..721d90f1f5b 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020, NVIDIA CORPORATION. 
+from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr @@ -8,7 +9,9 @@ from libcpp.string cimport string from libcpp.vector cimport vector from pyarrow.includes.libarrow cimport CRandomAccessFile +cimport cudf._lib.cpp.table.table_view as cudf_table_view from cudf._lib.cpp.table.table cimport table +from cudf._lib.cpp.types cimport size_type cdef extern from "cudf/io/types.hpp" \ @@ -52,15 +55,29 @@ cdef extern from "cudf/io/types.hpp" \ map[string, string] user_data vector[column_name_info] schema_info - cdef cppclass table_metadata_with_nullability(table_metadata): - table_metadata_with_nullability() except + - - vector[bool] nullability - cdef cppclass table_with_metadata: unique_ptr[table] tbl table_metadata metadata + cdef cppclass column_in_metadata: + column_in_metadata& set_name(const string& name) + column_in_metadata& set_nullability(bool nullable) + column_in_metadata& set_list_column_as_map() + column_in_metadata& set_int96_timestamps(bool req) + column_in_metadata& set_decimal_precision(uint8_t precision) + column_in_metadata& child(size_type i) + + cdef cppclass table_input_metadata: + table_input_metadata() except + + table_input_metadata(const cudf_table_view.table_view& table) except + + table_input_metadata( + const cudf_table_view.table_view& table, + map[string, string] user_data + ) except + + + vector[column_in_metadata] column_metadata + map[string, string] user_data + cdef cppclass host_buffer: const char* data size_t size diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index bc4f4aee9cd..03d163b7638 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -23,13 +23,13 @@ from cudf._lib.cpp.io.orc_metadata cimport ( read_raw_orc_statistics as libcudf_read_raw_orc_statistics, ) from cudf._lib.cpp.io.types cimport ( + column_in_metadata, column_name_info, compression_type, data_sink, sink_info, source_info, - 
table_metadata, - table_metadata_with_nullability, + table_input_metadata, table_with_metadata, ) from cudf._lib.cpp.table.table_view cimport table_view @@ -50,7 +50,8 @@ import numpy as np from cudf._lib.utils cimport data_from_unique_ptr, get_column_names -from cudf._lib.utils import generate_pandas_metadata +from cudf._lib.utils import _index_level_name, generate_pandas_metadata +from cudf.api.types import is_list_dtype, is_struct_dtype cpdef read_raw_orc_statistics(filepath_or_buffer): @@ -144,19 +145,35 @@ cpdef write_orc(Table table, cudf.read_orc """ cdef compression_type compression_ = _get_comp_type(compression) - cdef table_metadata metadata_ = table_metadata() cdef unique_ptr[data_sink] data_sink_c cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) - - metadata_.column_names.reserve(len(table._column_names)) - - for col_name in table._column_names: - metadata_.column_names.push_back(str.encode(col_name)) + cdef unique_ptr[table_input_metadata] tbl_meta + + if not isinstance(table._index, cudf.RangeIndex): + tv = table_view_from_table(table) + tbl_meta = make_unique[table_input_metadata](tv) + for level, idx_name in enumerate(table._index.names): + tbl_meta.get().column_metadata[level].set_name( + str.encode( + _index_level_name(idx_name, level, table._column_names) + ) + ) + num_index_cols_meta = len(table._index.names) + else: + tv = table_view_from_table(table, ignore_index=True) + tbl_meta = make_unique[table_input_metadata](tv) + num_index_cols_meta = 0 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + tbl_meta.get().column_metadata[i].set_name(name.encode()) + _set_col_children_names( + table[name]._column, tbl_meta.get().column_metadata[i] + ) cdef orc_writer_options c_orc_writer_options = move( orc_writer_options.builder( sink_info_c, table_view_from_table(table, ignore_index=True) - ).metadata(&metadata_) + ).metadata(tbl_meta.get()) .compression(compression_) .enable_statistics( (True if 
enable_statistics else False)) .build() @@ -231,6 +248,7 @@ cdef class ORCWriter: cdef bool enable_stats cdef compression_type comp_type cdef object index + cdef unique_ptr[table_input_metadata] tbl_meta def __cinit__(self, object path, object index=None, object compression=None, bool enable_statistics=True): @@ -268,20 +286,46 @@ cdef class ORCWriter: """ Prepare all the values required to build the chunked_orc_writer_options anb creates a writer""" - cdef unique_ptr[table_metadata_with_nullability] tbl_meta - tbl_meta = make_unique[table_metadata_with_nullability]() + cdef table_view tv # Set the table_metadata - tbl_meta.get().column_names = get_column_names(table, self.index) + num_index_cols_meta = 0 + self.tbl_meta = make_unique[table_input_metadata]( + table_view_from_table(table, ignore_index=True) + ) + if self.index is not False: + if isinstance(table._index, cudf.core.multiindex.MultiIndex): + tv = table_view_from_table(table) + self.tbl_meta = make_unique[table_input_metadata](tv) + for level, idx_name in enumerate(table._index.names): + self.tbl_meta.get().column_metadata[level].set_name( + (str.encode(idx_name)) + ) + num_index_cols_meta = len(table._index.names) + else: + if table._index.name is not None: + tv = table_view_from_table(table) + self.tbl_meta = make_unique[table_input_metadata](tv) + self.tbl_meta.get().column_metadata[0].set_name( + str.encode(table._index.name) + ) + num_index_cols_meta = 1 + + for i, name in enumerate(table._column_names, num_index_cols_meta): + self.tbl_meta.get().column_metadata[i].set_name(name.encode()) + _set_col_children_names( + table[name]._column, self.tbl_meta.get().column_metadata[i] + ) + pandas_metadata = generate_pandas_metadata(table, self.index) - tbl_meta.get().user_data[str.encode("pandas")] = \ + self.tbl_meta.get().user_data[str.encode("pandas")] = \ str.encode(pandas_metadata) cdef chunked_orc_writer_options args with nogil: args = move( chunked_orc_writer_options.builder(self.sink) - 
.metadata(tbl_meta.get()) + .metadata(self.tbl_meta.get()) .compression(self.comp_type) .enable_statistics(self.enable_stats) .build() @@ -289,3 +333,15 @@ cdef class ORCWriter: self.writer.reset(new orc_chunked_writer(args)) self.initialized = True + +cdef _set_col_children_names(Column col, column_in_metadata& col_meta): + if is_struct_dtype(col): + for i, (child_col, name) in enumerate( + zip(col.children, list(col.dtype.fields)) + ): + col_meta.child(i).set_name(name.encode()) + _set_col_children_names(child_col, col_meta.child(i)) + elif is_list_dtype(col): + _set_col_children_names(col.children[1], col_meta.child(1)) + else: + return diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d9017c7d6f8..70bdb6e2e60 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -45,15 +45,14 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.io.parquet cimport ( chunked_parquet_writer_options, chunked_parquet_writer_options_builder, - column_in_metadata, merge_rowgroup_metadata as parquet_merge_metadata, parquet_chunked_writer as cpp_parquet_chunked_writer, parquet_reader_options, parquet_writer_options, read_parquet as parquet_reader, - table_input_metadata, write_parquet as parquet_writer, ) +from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 73fbd50c824..cc5e1909d67 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -338,11 +338,11 @@ def to_orc(df, fname, compression=None, enable_statistics=True, **kwargs): for col in df._data.columns: if isinstance(col, cudf.core.column.StructColumn): - raise NotImplementedError( - "Writing to ORC format is not yet supported with " - "Struct columns." 
+ warnings.warn( + "Support for writing tables with struct columns is " + "currently experimental." ) - elif isinstance(col, cudf.core.column.CategoricalColumn): + if isinstance(col, cudf.core.column.CategoricalColumn): raise NotImplementedError( "Writing to ORC format is not yet supported with " "Categorical columns." diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 2d4dc55bd28..61c2ff5ed36 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -58,7 +58,6 @@ def _make_path_or_buf(src): @pytest.mark.filterwarnings("ignore:Using CPU") -@pytest.mark.filterwarnings("ignore:Strings are not yet supported") @pytest.mark.parametrize("engine", ["pyarrow", "cudf"]) @pytest.mark.parametrize("use_index", [False, True]) @pytest.mark.parametrize( @@ -221,6 +220,7 @@ def test_orc_read_statistics(datadir): assert_eq(file_statistics[0]["string1"]["minimum"], "one") +@pytest.mark.filterwarnings("ignore:Using CPU") @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) @pytest.mark.parametrize( "predicate,expected_len", @@ -244,6 +244,7 @@ def test_orc_read_filtered(datadir, engine, predicate, expected_len): assert len(df_filtered) == expected_len +@pytest.mark.filterwarnings("ignore:Using CPU") @pytest.mark.parametrize("engine", ["cudf", "pyarrow"]) def test_orc_read_stripes(datadir, engine): path = datadir / "TestOrcFile.testDate1900.orc" @@ -558,7 +559,6 @@ def test_orc_reader_boolean_type(datadir, orc_file): assert_eq(pdf, df) -@pytest.mark.filterwarnings("ignore:Using CPU") def test_orc_reader_tzif_timestamps(datadir): # Contains timstamps in the range covered by the TZif file # Other timedate tests only cover "future" times @@ -954,7 +954,9 @@ def generate_list_struct_buff(size=100_000): return buff -list_struct_buff = generate_list_struct_buff() +@pytest.fixture(scope="module") +def list_struct_buff(): + return generate_list_struct_buff() @pytest.mark.parametrize( @@ -967,9 +969,7 @@ def 
generate_list_struct_buff(size=100_000): ) @pytest.mark.parametrize("num_rows", [0, 15, 1005, 10561, 100_000]) @pytest.mark.parametrize("use_index", [True, False]) -def test_lists_struct_nests( - columns, num_rows, use_index, -): +def test_lists_struct_nests(columns, num_rows, use_index, list_struct_buff): gdf = cudf.read_orc( list_struct_buff, @@ -993,7 +993,7 @@ def test_lists_struct_nests( @pytest.mark.parametrize("columns", [None, ["lvl1_struct"], ["lvl1_list"]]) -def test_skip_rows_for_nested_types(columns): +def test_skip_rows_for_nested_types(columns, list_struct_buff): with pytest.raises( RuntimeError, match="skip_rows is not supported by nested column" ): @@ -1379,3 +1379,45 @@ def test_names_in_struct_dtype_nesting(datadir): edf = cudf.DataFrame(expect.to_pandas()) # test schema assert edf.dtypes.equals(got.dtypes) + + +@pytest.mark.filterwarnings("ignore:.*struct.*experimental") +def test_writer_lists_structs(list_struct_buff): + df_in = cudf.read_orc(list_struct_buff) + + buff = BytesIO() + df_in.to_orc(buff) + + pyarrow_tbl = pyarrow.orc.ORCFile(buff).read() + + assert pyarrow_tbl.equals(df_in.to_arrow()) + + +@pytest.mark.filterwarnings("ignore:.*struct.*experimental") +@pytest.mark.parametrize( + "data", + [ + { + "with_pd": [ + [i if i % 3 else None] if i < 9999 or i > 20001 else None + for i in range(21000) + ], + "no_pd": [ + [i if i % 3 else None] if i < 9999 or i > 20001 else [] + for i in range(21000) + ], + }, + ], +) +def test_orc_writer_lists_empty_rg(data): + pdf_in = pd.DataFrame(data) + buffer = BytesIO() + cudf_in = cudf.from_pandas(pdf_in) + + cudf_in.to_orc(buffer) + + df = cudf.read_orc(buffer) + assert_eq(df, cudf_in) + + pdf_out = pa.orc.ORCFile(buffer).read().to_pandas() + assert_eq(pdf_in, pdf_out) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 4bffd06c4cc..e23318eb999 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -391,6 +391,12 @@ 
enable_statistics: boolean, default True Enable writing column statistics. + +Notes +----- +Support for writing tables with struct columns is currently experimental, +the output may not be as reliable as writing for other datatypes. + See Also -------- cudf.read_orc From eaedf17e80d287587a39cb7603fc637f8f5027ff Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Thu, 23 Sep 2021 05:35:57 +0530 Subject: [PATCH 18/26] Update nvcomp to include fixes for installation of headers (#9276) Authors: - Devavret Makkar (https://github.com/devavret) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/9276 --- cpp/cmake/thirdparty/get_nvcomp.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index cade101cbfd..16d50fd3388 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -21,7 +21,7 @@ function(find_and_configure_nvcomp VERSION) GLOBAL_TARGETS nvcomp::nvcomp CPM_ARGS GITHUB_REPOSITORY NVIDIA/nvcomp - GIT_TAG 4f4e5713e69473be6e0c8ae483a932f666ae3c2f + GIT_TAG aa003db89e052e4ce408910ff17e1054b7c43b7d OPTIONS "BUILD_STATIC ON" "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" From ced66b568a837572daf5bc0a3c5c7b258cc4ba34 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 22 Sep 2021 21:34:27 -0500 Subject: [PATCH 19/26] Align `DataFrame.apply` signature with pandas (#9275) Aligns the function signature for `cudf.DataFrame.apply` with that of `pandas.DataFrame.apply`. This is needed so that dask can build on a common `apply` interface between backends among other reasons. 
Authors: - https://github.com/brandon-b-miller Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/9275 --- python/cudf/cudf/core/dataframe.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1143f85a4e6..901bdfe42c8 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4742,7 +4742,9 @@ def query(self, expr, local_dict=None): boolmask = queryutils.query_execute(self, expr, callenv) return self._apply_boolean_mask(boolmask) - def apply(self, func, axis=1): + def apply( + self, func, axis=1, raw=False, result_type=None, args=(), **kwargs + ): """ Apply a function along an axis of the DataFrame. @@ -4756,12 +4758,17 @@ def apply(self, func, axis=1): ---------- func : function Function to apply to each row. - axis : {0 or 'index', 1 or 'columns'}, default 0 Axis along which the function is applied: * 0 or 'index': apply function to each column. Note: axis=0 is not yet supported. * 1 or 'columns': apply function to each row. 
+ raw: bool, default False + Not yet supported + result_type: {'expand', 'reduce', 'broadcast', None}, default None + Not yet supported + args: tuple + Not yet supported Examples -------- @@ -4910,6 +4917,12 @@ def apply(self, func, axis=1): raise ValueError( "DataFrame.apply currently only supports row wise ops" ) + if raw: + raise ValueError("The `raw` kwarg is not yet supported.") + if result_type is not None: + raise ValueError("The `result_type` kwarg is not yet supported.") + if args or kwargs: + raise ValueError("args and kwargs are not yet supported.") return cudf.Series(func(self)) From c43165091c85fd698937cefaab63962c00566a1b Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 24 Sep 2021 00:49:56 +0530 Subject: [PATCH 20/26] Revert "Add shallow hash function and shallow equality comparison for column_view (#9185)" (#9283) Reverts rapidsai/cudf#9185 More details on PR https://github.com/rapidsai/cudf/pull/9185 Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Devavret Makkar (https://github.com/devavret) - Conor Hoekstra (https://github.com/codereport) URL: https://github.com/rapidsai/cudf/pull/9283 --- cpp/include/cudf/column/column_view.hpp | 41 -- cpp/include/cudf/detail/hashing.hpp | 36 -- .../cudf/detail/utilities/hash_functions.cuh | 12 - cpp/include/cudf_test/type_lists.hpp | 12 - cpp/src/column/column_view.cpp | 55 --- cpp/tests/CMakeLists.txt | 1 - cpp/tests/column/column_view_shallow_test.cpp | 442 ------------------ 7 files changed, 599 deletions(-) delete mode 100644 cpp/tests/column/column_view_shallow_test.cpp diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp index cd490c3c832..7feaeafbad0 100644 --- a/cpp/include/cudf/column/column_view.hpp +++ b/cpp/include/cudf/column/column_view.hpp @@ -633,45 +633,4 @@ column_view 
bit_cast(column_view const& input, data_type type); */ mutable_column_view bit_cast(mutable_column_view const& input, data_type type); -namespace detail { -/** - * @brief Computes a hash value from the shallow state of the specified column - * - * For any two columns, if `is_shallow_equivalent(c0,c1)` then `shallow_hash(c0) == - * shallow_hash(c1)`. - * - * The complexity of computing the hash value of `input` is `O( count_descendants(input) )`, i.e., - * it is independent of the number of elements in the column. - * - * This function does _not_ inspect the elements of `input` nor access any device memory or launch - * any kernels. - * - * @param input The `column_view` to compute hash - * @return The hash value derived from the shallow state of `input`. - */ -std::size_t shallow_hash(column_view const& input); - -/** - * @brief Uses only shallow state to determine if two `column_view`s view equivalent columns - * - * Two columns are equivalent if for any operation `F` then: - * ``` - * is_shallow_equivalent(c0, c1) ==> The results of F(c0) and F(c1) are equivalent - * ``` - * For any two non-empty columns, `is_shallow_equivalent(c0,c1)` is true only if they view the exact - * same physical column. In other words, two physically independent columns may have exactly - * equivalent elements but their shallow state would not be equivalent. - * - * The complexity of this function is `O( min(count_descendants(lhs), count_descendants(rhs)) )`, - * i.e., it is independent of the number of elements in either column. - * - * This function does _not_ inspect the elements of `lhs` or `rhs` nor access any device memory nor - * launch any kernels. 
- * - * @param lhs The left `column_view` to compare - * @param rhs The right `column_view` to compare - * @return If `lhs` and `rhs` have equivalent shallow state - */ -bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs); -} // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index bd5c8a42a51..83d6be14709 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -19,9 +19,6 @@ #include -#include -#include - namespace cudf { namespace detail { @@ -56,38 +53,5 @@ std::unique_ptr serial_murmur_hash3_32( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/* Copyright 2005-2014 Daniel James. - * - * Use, modification and distribution is subject to the Boost Software - * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ -/** - * @brief Combines two hashed values into a single hashed value. 
- * - * Adapted from Boost hash_combine function, modified for 64-bit - * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html - * - * @param lhs The first hashed value - * @param rhs The second hashed value - * @return Combined hash value - */ -constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs) -{ - lhs ^= rhs + 0x9e3779b97f4a7c15 + (lhs << 6) + (lhs >> 2); - return lhs; -} } // namespace detail } // namespace cudf - -// specialization of std::hash for cudf::data_type -namespace std { -template <> -struct hash { - std::size_t operator()(cudf::data_type const& type) const noexcept - { - return cudf::detail::hash_combine(std::hash{}(static_cast(type.id())), - std::hash{}(type.scale())); - } -}; -} // namespace std diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 65deadd6cd0..6eab13ae9af 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -395,12 +395,6 @@ struct MurmurHash3_32 { return h; } - /* Copyright 2005-2014 Daniel James. - * - * Use, modification and distribution is subject to the Boost Software - * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ /** * @brief Combines two hash values into a new single hash value. Called * repeatedly to create a hash value from several variables. @@ -801,12 +795,6 @@ struct IdentityHash { IdentityHash() = default; constexpr IdentityHash(uint32_t seed) : m_seed(seed) {} - /* Copyright 2005-2014 Daniel James. - * - * Use, modification and distribution is subject to the Boost Software - * License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at - * http://www.boost.org/LICENSE_1_0.txt) - */ /** * @brief Combines two hash values into a new single hash value. Called * repeatedly to create a hash value from several variables. 
diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index 982c94ac402..74688b7f133 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -315,18 +315,6 @@ using FixedWidthTypesWithoutChrono = Concat; */ using ComparableTypes = Concat; -/** - * @brief Provides a list of all compound types for use in GTest typed tests. - * - * Example: - * ``` - * // Invokes all typed fixture tests for all compound types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::CompoundTypes); - * ``` - */ -using CompoundTypes = - cudf::test::Types; - /** * @brief Provides a list of all types supported in libcudf for use in a GTest * typed test. diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 5749cb48c0e..186669ae697 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -15,7 +15,6 @@ */ #include -#include #include #include #include @@ -23,7 +22,6 @@ #include -#include #include #include #include @@ -78,59 +76,6 @@ size_type column_view_base::null_count(size_type begin, size_type end) const ? 0 : cudf::count_unset_bits(null_mask(), offset() + begin, offset() + end); } - -// Struct to use custom hash combine and fold expression -struct HashValue { - std::size_t hash; - explicit HashValue(std::size_t h) : hash{h} {} - HashValue operator^(HashValue const& other) const - { - return HashValue{hash_combine(hash, other.hash)}; - } -}; - -template -constexpr auto hash(Ts&&... ts) -{ - return (... ^ HashValue(std::hash{}(ts))).hash; -} - -std::size_t shallow_hash_impl(column_view const& c, bool is_parent_empty = false) -{ - std::size_t const init = (is_parent_empty or c.is_empty()) - ? 
hash(c.type(), 0) - : hash(c.type(), c.size(), c.head(), c.null_mask(), c.offset()); - return std::accumulate(c.child_begin(), - c.child_end(), - init, - [&c, is_parent_empty](std::size_t hash, auto const& child) { - return hash_combine( - hash, shallow_hash_impl(child, c.is_empty() or is_parent_empty)); - }); -} - -std::size_t shallow_hash(column_view const& input) { return shallow_hash_impl(input); } - -bool shallow_equivalent_impl(column_view const& lhs, - column_view const& rhs, - bool is_parent_empty = false) -{ - bool const is_empty = (lhs.is_empty() and rhs.is_empty()) or is_parent_empty; - return (lhs.type() == rhs.type()) and - (is_empty or ((lhs.size() == rhs.size()) and (lhs.head() == rhs.head()) and - (lhs.null_mask() == rhs.null_mask()) and (lhs.offset() == rhs.offset()))) and - std::equal(lhs.child_begin(), - lhs.child_end(), - rhs.child_begin(), - rhs.child_end(), - [is_empty](auto const& lhs_child, auto const& rhs_child) { - return shallow_equivalent_impl(lhs_child, rhs_child, is_empty); - }); -} -bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs) -{ - return shallow_equivalent_impl(lhs, rhs); -} } // namespace detail // Immutable view constructor diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index cde170fb598..03f7967cee0 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -33,7 +33,6 @@ endfunction() # - column tests ---------------------------------------------------------------------------------- ConfigureTest(COLUMN_TEST column/bit_cast_test.cpp - column/column_view_shallow_test.cpp column/column_test.cu column/column_device_view_test.cu column/compound_test.cu) diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp deleted file mode 100644 index f76f682bb2f..00000000000 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -#include -#include - -// fixed_width, dict, string, list, struct -template ()>* = nullptr> -std::unique_ptr example_column() -{ - auto begin = thrust::make_counting_iterator(1); - auto end = thrust::make_counting_iterator(16); - return cudf::test::fixed_width_column_wrapper(begin, end).release(); -} - -template ()>* = nullptr> -std::unique_ptr example_column() -{ - return cudf::test::dictionary_column_wrapper( - {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0}) - .release(); -} - -template or - std::is_same_v>* = nullptr> -std::unique_ptr example_column() - -{ - return cudf::test::strings_column_wrapper( - {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}) - .release(); -} - -template >* = nullptr> -std::unique_ptr example_column() -{ - return cudf::test::lists_column_wrapper({{1, 2, 3}, {4, 5}, {}, {6, 7, 8}}).release(); -} - -template >* = nullptr> -std::unique_ptr example_column() -{ - auto begin = thrust::make_counting_iterator(1); - auto end = thrust::make_counting_iterator(16); - auto member_0 = cudf::test::fixed_width_column_wrapper(begin, end); - auto member_1 = cudf::test::fixed_width_column_wrapper(begin + 10, end + 10); - return cudf::test::structs_column_wrapper({member_0, member_1}).release(); -} - -template -struct ColumnViewShallowTests : 
public cudf::test::BaseFixture { -}; - -using AllTypes = cudf::test::Concat; -TYPED_TEST_CASE(ColumnViewShallowTests, AllTypes); - -// Test for fixed_width, dict, string, list, struct -// column_view, column_view = same hash. -// column_view, make a copy = same hash. -// new column_view from colmn = same hash -// column_view, copy column = diff hash -// column_view, diff column = diff hash. -// -// column_view old, update data + new column_view = same hash. -// column_view old, add null_mask + new column_view = diff hash. -// column_view old, update nulls + new column_view = same hash. -// column_view old, set_null_count + new column_view = same hash. -// -// column_view, sliced[0, size) = same hash (for split too) -// column_view, sliced[n:) = diff hash (for split too) -// column_view, bit_cast = diff hash -// -// mutable_column_view, column_view = same hash -// mutable_column_view, modified mutable_column_view = same hash -// -// update the children column data = same hash -// update the children column_views = diff hash - -TYPED_TEST(ColumnViewShallowTests, shallow_hash_basic) -{ - using namespace cudf::detail; - auto col = example_column(); - auto col_view = cudf::column_view{*col}; - // same = same hash - { - EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view)); - } - // copy column_view = same hash - { - auto col_view_copy = col_view; - EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_copy)); - } - - // new column_view from column = same hash - { - auto col_view_new = cudf::column_view{*col}; - EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); - } - - // copy column = diff hash - { - auto col_new = std::make_unique(*col); - auto col_view_copy = col_new->view(); - EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_copy)); - } - - // column_view, diff column = diff hash. 
- { - auto col_diff = example_column(); - auto col_view_diff = cudf::column_view{*col_diff}; - EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_diff)); - } -} -TYPED_TEST(ColumnViewShallowTests, shallow_hash_update_data) -{ - using namespace cudf::detail; - auto col = example_column(); - auto col_view = cudf::column_view{*col}; - // update data + new column_view = same hash. - { - // update data by modifying some bits: fixed_width, string, dict, list, struct - if constexpr (cudf::is_fixed_width()) { - // Update data - auto data = reinterpret_cast(col->mutable_view().head()); - cudf::set_null_mask(data, 2, 64, true); - } else { - // Update child(0).data - auto data = reinterpret_cast(col->child(0).mutable_view().head()); - cudf::set_null_mask(data, 2, 64, true); - } - auto col_view_new = cudf::column_view{*col}; - EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); - } - // add null_mask + new column_view = diff hash. - { - col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); - auto col_view_new = cudf::column_view{*col}; - EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); - col_view_new.null_count(); - EXPECT_NE(shallow_hash(col_view), shallow_hash(col_view_new)); - auto col_view_new2 = cudf::column_view{*col}; - EXPECT_EQ(shallow_hash(col_view_new), shallow_hash(col_view_new2)); - } - col_view = cudf::column_view{*col}; // updating after adding null_mask - // update nulls + new column_view = same hash. - { - cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false); - auto col_view_new = cudf::column_view{*col}; - EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); - } - // set_null_count + new column_view = same hash. 
set_null_count(UNKNOWN_NULL_COUNT) - { - col->set_null_count(cudf::UNKNOWN_NULL_COUNT); - auto col_view_new = cudf::column_view{*col}; - EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new)); - col->set_null_count(col->size()); - auto col_view_new2 = cudf::column_view{*col}; - EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_view_new2)); - } -} - -TYPED_TEST(ColumnViewShallowTests, shallow_hash_slice) -{ - using namespace cudf::detail; - auto col = example_column(); - auto col_view = cudf::column_view{*col}; - // column_view, sliced[0, size) = same hash (for split too) - { - auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); - EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_sliced[0])); - auto col_split = cudf::split(col_view, {0}); - EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[0])); - EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_split[1])); - } - // column_view, sliced[n:] = diff hash (for split too) - { - auto col_sliced = cudf::slice(col_view, {1, col_view.size()}); - EXPECT_NE(shallow_hash(col_view), shallow_hash(col_sliced[0])); - auto col_split = cudf::split(col_view, {1}); - EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[0])); - EXPECT_NE(shallow_hash(col_view), shallow_hash(col_split[1])); - } - // column_view, col copy sliced[0, 0) = same hash (empty column) - { - auto col_new = std::make_unique(*col); - auto col_new_view = col_new->view(); - auto col_sliced = cudf::slice(col_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); - auto col_new_sliced = cudf::slice(col_new_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); - - EXPECT_EQ(shallow_hash(col_sliced[0]), shallow_hash(col_sliced[1])); - EXPECT_EQ(shallow_hash(col_sliced[1]), shallow_hash(col_sliced[2])); - EXPECT_EQ(shallow_hash(col_sliced[0]), shallow_hash(col_new_sliced[0])); - EXPECT_EQ(shallow_hash(col_sliced[1]), shallow_hash(col_new_sliced[1])); - EXPECT_EQ(shallow_hash(col_sliced[2]), shallow_hash(col_new_sliced[2])); - } - - // 
column_view, bit_cast = diff hash - { - if constexpr (std::is_integral_v and not std::is_same_v) { - using newType = std::conditional_t, - std::make_unsigned_t, - std::make_signed_t>; - auto new_type = cudf::data_type(cudf::type_to_id()); - auto col_bitcast = cudf::bit_cast(col_view, new_type); - EXPECT_NE(shallow_hash(col_view), shallow_hash(col_bitcast)); - } - } -} - -TYPED_TEST(ColumnViewShallowTests, shallow_hash_mutable) -{ - using namespace cudf::detail; - auto col = example_column(); - auto col_view = cudf::column_view{*col}; - // mutable_column_view, column_view = same hash - { - auto col_mutable = cudf::mutable_column_view{*col}; - EXPECT_EQ(shallow_hash(col_mutable), shallow_hash(col_view)); - } - // mutable_column_view, modified mutable_column_view = same hash - // update the children column data = same hash - { - auto col_mutable = cudf::mutable_column_view{*col}; - if constexpr (cudf::is_fixed_width()) { - // Update data - auto data = reinterpret_cast(col->mutable_view().head()); - cudf::set_null_mask(data, 1, 32, false); - } else { - // Update child(0).data - auto data = reinterpret_cast(col->child(0).mutable_view().head()); - cudf::set_null_mask(data, 1, 32, false); - } - EXPECT_EQ(shallow_hash(col_view), shallow_hash(col_mutable)); - auto col_mutable_new = cudf::mutable_column_view{*col}; - EXPECT_EQ(shallow_hash(col_mutable), shallow_hash(col_mutable_new)); - } - // update the children column_views = diff hash - { - if constexpr (cudf::is_nested()) { - col->child(0).set_null_mask( - cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL)); - auto col_child_updated = cudf::mutable_column_view{*col}; - EXPECT_NE(shallow_hash(col_view), shallow_hash(col_child_updated)); - } - } -} - -TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_basic) -{ - using namespace cudf::detail; - auto col = example_column(); - auto col_view = cudf::column_view{*col}; - // same = same hash - { - EXPECT_TRUE(is_shallow_equivalent(col_view, 
col_view)); - } - // copy column_view = same hash - { - auto col_view_copy = col_view; - EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_copy)); - } - - // new column_view from column = same hash - { - auto col_view_new = cudf::column_view{*col}; - EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); - } - - // copy column = diff hash - { - auto col_new = std::make_unique(*col); - auto col_view_copy = col_new->view(); - EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_copy)); - } - - // column_view, diff column = diff hash. - { - auto col_diff = example_column(); - auto col_view_diff = cudf::column_view{*col_diff}; - EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_diff)); - } -} -TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_update_data) -{ - using namespace cudf::detail; - auto col = example_column(); - auto col_view = cudf::column_view{*col}; - // update data + new column_view = same hash. - { - // update data by modifying some bits: fixed_width, string, dict, list, struct - if constexpr (cudf::is_fixed_width()) { - // Update data - auto data = reinterpret_cast(col->mutable_view().head()); - cudf::set_null_mask(data, 2, 64, true); - } else { - // Update child(0).data - auto data = reinterpret_cast(col->child(0).mutable_view().head()); - cudf::set_null_mask(data, 2, 64, true); - } - auto col_view_new = cudf::column_view{*col}; - EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); - } - // add null_mask + new column_view = diff hash. 
- { - col->set_null_mask(cudf::create_null_mask(col->size(), cudf::mask_state::ALL_VALID)); - auto col_view_new = cudf::column_view{*col}; - EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); - col_view_new.null_count(); - EXPECT_FALSE(is_shallow_equivalent(col_view, col_view_new)); - auto col_view_new2 = cudf::column_view{*col}; - EXPECT_TRUE(is_shallow_equivalent(col_view_new, col_view_new2)); - } - col_view = cudf::column_view{*col}; // updating after adding null_mask - // update nulls + new column_view = same hash. - { - cudf::set_null_mask(col->mutable_view().null_mask(), 2, 4, false); - auto col_view_new = cudf::column_view{*col}; - EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); - } - // set_null_count + new column_view = same hash. set_null_count(UNKNOWN_NULL_COUNT) - { - col->set_null_count(cudf::UNKNOWN_NULL_COUNT); - auto col_view_new = cudf::column_view{*col}; - EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new)); - col->set_null_count(col->size()); - auto col_view_new2 = cudf::column_view{*col}; - EXPECT_TRUE(is_shallow_equivalent(col_view, col_view_new2)); - } -} - -TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_slice) -{ - using namespace cudf::detail; - auto col = example_column(); - auto col_view = cudf::column_view{*col}; - // column_view, sliced[0, size) = same hash (for split too) - { - auto col_sliced = cudf::slice(col_view, {0, col_view.size()}); - EXPECT_TRUE(is_shallow_equivalent(col_view, col_sliced[0])); - auto col_split = cudf::split(col_view, {0}); - EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[0])); - EXPECT_TRUE(is_shallow_equivalent(col_view, col_split[1])); - } - // column_view, sliced[n:] = diff hash (for split too) - { - auto col_sliced = cudf::slice(col_view, {1, col_view.size()}); - EXPECT_FALSE(is_shallow_equivalent(col_view, col_sliced[0])); - auto col_split = cudf::split(col_view, {1}); - EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[0])); - 
EXPECT_FALSE(is_shallow_equivalent(col_view, col_split[1])); - } - // column_view, col copy sliced[0, 0) = same hash (empty column) - { - auto col_new = std::make_unique(*col); - auto col_new_view = col_new->view(); - auto col_sliced = cudf::slice(col_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); - auto col_new_sliced = cudf::slice(col_new_view, {0, 0, 1, 1, col_view.size(), col_view.size()}); - - EXPECT_TRUE(is_shallow_equivalent(col_sliced[0], col_sliced[1])); - EXPECT_TRUE(is_shallow_equivalent(col_sliced[1], col_sliced[2])); - EXPECT_TRUE(is_shallow_equivalent(col_sliced[0], col_new_sliced[0])); - EXPECT_TRUE(is_shallow_equivalent(col_sliced[1], col_new_sliced[1])); - EXPECT_TRUE(is_shallow_equivalent(col_sliced[2], col_new_sliced[2])); - } - - // column_view, bit_cast = diff hash - { - if constexpr (std::is_integral_v and not std::is_same_v) { - using newType = std::conditional_t, - std::make_unsigned_t, - std::make_signed_t>; - auto new_type = cudf::data_type(cudf::type_to_id()); - auto col_bitcast = cudf::bit_cast(col_view, new_type); - EXPECT_FALSE(is_shallow_equivalent(col_view, col_bitcast)); - } - } -} - -TYPED_TEST(ColumnViewShallowTests, is_shallow_equivalent_mutable) -{ - using namespace cudf::detail; - auto col = example_column(); - auto col_view = cudf::column_view{*col}; - // mutable_column_view, column_view = same hash - { - auto col_mutable = cudf::mutable_column_view{*col}; - EXPECT_TRUE(is_shallow_equivalent(col_mutable, col_view)); - } - // mutable_column_view, modified mutable_column_view = same hash - // update the children column data = same hash - { - auto col_mutable = cudf::mutable_column_view{*col}; - if constexpr (cudf::is_fixed_width()) { - // Update data - auto data = reinterpret_cast(col->mutable_view().head()); - cudf::set_null_mask(data, 1, 32, false); - } else { - // Update child(0).data - auto data = reinterpret_cast(col->child(0).mutable_view().head()); - cudf::set_null_mask(data, 1, 32, false); - } - 
EXPECT_TRUE(is_shallow_equivalent(col_view, col_mutable)); - auto col_mutable_new = cudf::mutable_column_view{*col}; - EXPECT_TRUE(is_shallow_equivalent(col_mutable, col_mutable_new)); - } - // update the children column_views = diff hash - { - if constexpr (cudf::is_nested()) { - col->child(0).set_null_mask( - cudf::create_null_mask(col->child(0).size(), cudf::mask_state::ALL_NULL)); - auto col_child_updated = cudf::mutable_column_view{*col}; - EXPECT_FALSE(is_shallow_equivalent(col_view, col_child_updated)); - } - } -} From 15a7dcce5d9ce2413f9d39e97ff1e45b62467879 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 23 Sep 2021 17:29:51 -0500 Subject: [PATCH 21/26] Pin max `dask` and `distributed` versions to `2021.09.1` (#9286) This PR pins max `dask` & `distributed` versions to `2021.09.1` for `21.10` release. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - https://github.com/brandon-b-miller - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/9286 --- ci/benchmark/build.sh | 11 +++++++---- ci/gpu/build.sh | 7 +++++-- conda/environments/cudf_dev_cuda11.0.yml | 8 ++++---- conda/environments/cudf_dev_cuda11.2.yml | 8 ++++---- conda/recipes/custreamz/meta.yaml | 4 ++-- conda/recipes/dask-cudf/meta.yaml | 8 ++++---- python/custreamz/dev_requirements.txt | 4 ++-- python/dask_cudf/dev_requirements.txt | 4 ++-- python/dask_cudf/setup.py | 8 ++++---- 9 files changed, 34 insertions(+), 28 deletions(-) diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index e73153ce0c3..c2544ff7ffe 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -36,6 +36,9 @@ export GBENCH_BENCHMARKS_DIR="$WORKSPACE/cpp/build/gbenchmarks/" # like `/tmp` is. export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" +# Dask & Distributed git tag +export DASK_DISTRIBUTED_GIT_TAG='2021.09.1' + function remove_libcudf_kernel_cache_dir { EXITCODE=$? 
logger "removing kernel cache dir: $LIBCUDF_KERNEL_CACHE_PATH" @@ -75,10 +78,10 @@ conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \ # conda install "your-pkg=1.0.0" # Install the master version of dask, distributed, and streamz -logger "pip install git+https://github.com/dask/distributed.git@main --upgrade --no-deps" -pip install "git+https://github.com/dask/distributed.git@main" --upgrade --no-deps -logger "pip install git+https://github.com/dask/dask.git@main --upgrade --no-deps" -pip install "git+https://github.com/dask/dask.git@main" --upgrade --no-deps +logger "pip install git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG --upgrade --no-deps" +pip install "git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps +logger "pip install git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG --upgrade --no-deps" +pip install "git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps logger "pip install git+https://github.com/python-streamz/streamz.git@master --upgrade --no-deps" pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 918e4760c71..7c5b9d836dd 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -30,6 +30,9 @@ export CONDA_ARTIFACT_PATH="$WORKSPACE/ci/artifacts/cudf/cpu/.conda-bld/" export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` +# Dask & Distributed git tag +export DASK_DISTRIBUTED_GIT_TAG='2021.09.1' + ################################################################################ # TRAP - Setup trap for removing jitify cache ################################################################################ @@ -101,8 +104,8 @@ function install_dask { # Install the main version of dask, distributed, and 
streamz gpuci_logger "Install the main version of dask, distributed, and streamz" set -x - pip install "git+https://github.com/dask/distributed.git@2021.07.1" --upgrade --no-deps - pip install "git+https://github.com/dask/dask.git@2021.07.1" --upgrade --no-deps + pip install "git+https://github.com/dask/distributed.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps + pip install "git+https://github.com/dask/dask.git@$DASK_DISTRIBUTED_GIT_TAG" --upgrade --no-deps # Need to uninstall streamz that is already in the env. pip uninstall -y streamz pip install "git+https://github.com/python-streamz/streamz.git@master" --upgrade --no-deps diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index f975aded863..1476c294682 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -39,8 +39,8 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask=2021.09.1 + - distributed=2021.09.1 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 @@ -58,7 +58,7 @@ dependencies: - transformers - pydata-sphinx-theme - pip: - - git+https://github.com/dask/dask.git@2021.07.1 - - git+https://github.com/dask/distributed.git@2021.07.1 + - git+https://github.com/dask/dask.git@2021.09.1 + - git+https://github.com/dask/distributed.git@2021.09.1 - git+https://github.com/python-streamz/streamz.git@master - pyorc diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index f1487ee3e4b..37f1899fcf5 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -39,8 +39,8 @@ dependencies: - mypy=0.782 - typing_extensions - pre_commit - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask=2021.09.1 + - distributed=2021.09.1 - streamz - arrow-cpp=5.0.0 - dlpack>=0.5,<0.6.0a0 @@ -58,7 +58,7 @@ 
dependencies: - transformers - pydata-sphinx-theme - pip: - - git+https://github.com/dask/dask.git@2021.07.1 - - git+https://github.com/dask/distributed.git@2021.07.1 + - git+https://github.com/dask/dask.git@2021.09.1 + - git+https://github.com/dask/distributed.git@2021.09.1 - git+https://github.com/python-streamz/streamz.git@master - pyorc diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index d0965e97567..db8aa8e6c85 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -31,8 +31,8 @@ requirements: - python - streamz - cudf {{ version }} - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask=2021.09.1 + - distributed=2021.09.1 - python-confluent-kafka - cudf_kafka {{ version }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 1b2c4efd610..45d96a2de85 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -26,13 +26,13 @@ requirements: host: - python - cudf {{ version }} - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask=2021.09.1 + - distributed=2021.09.1 run: - python - cudf {{ version }} - - dask>=2021.6.0 - - distributed>=2021.6.0 + - dask=2021.09.1 + - distributed=2021.09.1 test: # [linux64] requires: # [linux64] diff --git a/python/custreamz/dev_requirements.txt b/python/custreamz/dev_requirements.txt index 61e4817b1c2..2f2a45dbe05 100644 --- a/python/custreamz/dev_requirements.txt +++ b/python/custreamz/dev_requirements.txt @@ -3,8 +3,8 @@ flake8==3.8.3 black==19.10b0 isort==5.6.4 -dask>=2021.6.0 -distributed>=2021.6.0 +dask==2021.09.1 +distributed==2021.09.1 streamz python-confluent-kafka pytest diff --git a/python/dask_cudf/dev_requirements.txt b/python/dask_cudf/dev_requirements.txt index 0b601180711..7d41184feae 100644 --- a/python/dask_cudf/dev_requirements.txt +++ b/python/dask_cudf/dev_requirements.txt @@ -1,7 +1,7 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
-dask>=2021.6.0 -distributed>=2021.6.0 +dask==2021.09.1 +distributed==2021.09.1 fsspec>=0.6.0 numba>=0.53.1 numpy diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index c4cb57ff89a..515469f8b6c 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -10,8 +10,8 @@ install_requires = [ "cudf", - "dask>=2021.6.0", - "distributed>=2021.6.0", + "dask==2021.09.1", + "distributed==2021.09.1", "fsspec>=0.6.0", "numpy", "pandas>=1.0,<1.4.0dev0", @@ -23,8 +23,8 @@ "pandas>=1.0,<1.4.0dev0", "pytest", "numba>=0.53.1", - "dask>=2021.6.0", - "distributed>=2021.6.0", + "dask==2021.09.1", + "distributed==2021.09.1", ] } From 817c3fad1dd997c8e7dfcce2f13c39bc7320a38f Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Thu, 23 Sep 2021 16:43:30 -0700 Subject: [PATCH 22/26] Fix logic while parsing the sum statistic for numerical orc columns (#9183) Fixes #9182. In cases where the `sum` statistic was not present in the orc file for int and float columns, the values would be incorrectly interpreted as 0 because of protobuf's [default](https://developers.google.com/protocol-buffers/docs/proto#optional) values when fields are missing. This PR adds a check for field presence before assignment. 
Authors: - Ayush Dattagupta (https://github.com/ayushdg) Approvers: - Sheilah Kirui (https://github.com/skirui-source) - Vukasin Milovanovic (https://github.com/vuule) - Marlene (https://github.com/marlenezw) URL: https://github.com/rapidsai/cudf/pull/9183 --- python/cudf/cudf/io/orc.py | 86 ++++++++++++++++++++++++------ python/cudf/cudf/tests/test_orc.py | 71 ++++++++++++++++++++++++ 2 files changed, 141 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index cc5e1909d67..3aa672223c9 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -41,37 +41,90 @@ def _parse_column_statistics(cs, column_statistics_blob): column_statistics["number_of_values"] = cs.numberOfValues if cs.HasField("hasNull"): column_statistics["has_null"] = cs.hasNull + if cs.HasField("intStatistics"): - column_statistics["minimum"] = cs.intStatistics.minimum - column_statistics["maximum"] = cs.intStatistics.maximum - column_statistics["sum"] = cs.intStatistics.sum + column_statistics["minimum"] = ( + cs.intStatistics.minimum + if cs.intStatistics.HasField("minimum") + else None + ) + column_statistics["maximum"] = ( + cs.intStatistics.maximum + if cs.intStatistics.HasField("maximum") + else None + ) + column_statistics["sum"] = ( + cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None + ) + elif cs.HasField("doubleStatistics"): - column_statistics["minimum"] = cs.doubleStatistics.minimum - column_statistics["maximum"] = cs.doubleStatistics.maximum - column_statistics["sum"] = cs.doubleStatistics.sum + column_statistics["minimum"] = ( + cs.doubleStatistics.minimum + if cs.doubleStatistics.HasField("minimum") + else None + ) + column_statistics["maximum"] = ( + cs.doubleStatistics.maximum + if cs.doubleStatistics.HasField("maximum") + else None + ) + column_statistics["sum"] = ( + cs.doubleStatistics.sum + if cs.doubleStatistics.HasField("sum") + else None + ) + elif 
cs.HasField("stringStatistics"): - column_statistics["minimum"] = cs.stringStatistics.minimum - column_statistics["maximum"] = cs.stringStatistics.maximum + column_statistics["minimum"] = ( + cs.stringStatistics.minimum + if cs.stringStatistics.HasField("minimum") + else None + ) + column_statistics["maximum"] = ( + cs.stringStatistics.maximum + if cs.stringStatistics.HasField("maximum") + else None + ) column_statistics["sum"] = cs.stringStatistics.sum + elif cs.HasField("bucketStatistics"): column_statistics["true_count"] = cs.bucketStatistics.count[0] column_statistics["false_count"] = ( column_statistics["number_of_values"] - column_statistics["true_count"] ) + elif cs.HasField("decimalStatistics"): - column_statistics["minimum"] = cs.decimalStatistics.minimum - column_statistics["maximum"] = cs.decimalStatistics.maximum + column_statistics["minimum"] = ( + cs.decimalStatistics.minimum + if cs.decimalStatistics.HasField("minimum") + else None + ) + column_statistics["maximum"] = ( + cs.decimalStatistics.maximum + if cs.decimalStatistics.HasField("maximum") + else None + ) column_statistics["sum"] = cs.decimalStatistics.sum + elif cs.HasField("dateStatistics"): - column_statistics["minimum"] = datetime.datetime.fromtimestamp( - datetime.timedelta(cs.dateStatistics.minimum).total_seconds(), - datetime.timezone.utc, + column_statistics["minimum"] = ( + datetime.datetime.fromtimestamp( + datetime.timedelta(cs.dateStatistics.minimum).total_seconds(), + datetime.timezone.utc, + ) + if cs.dateStatistics.HasField("minimum") + else None ) - column_statistics["maximum"] = datetime.datetime.fromtimestamp( - datetime.timedelta(cs.dateStatistics.maximum).total_seconds(), - datetime.timezone.utc, + column_statistics["maximum"] = ( + datetime.datetime.fromtimestamp( + datetime.timedelta(cs.dateStatistics.maximum).total_seconds(), + datetime.timezone.utc, + ) + if cs.dateStatistics.HasField("maximum") + else None ) + elif cs.HasField("timestampStatistics"): # Before ORC-135, 
the local timezone offset was included and they were # stored as minimum and maximum. After ORC-135, the timestamp is @@ -87,6 +140,7 @@ def _parse_column_statistics(cs, column_statistics_blob): column_statistics["maximum"] = datetime.datetime.fromtimestamp( cs.timestampStatistics.maximumUtc / 1000, datetime.timezone.utc ) + elif cs.HasField("binaryStatistics"): column_statistics["sum"] = cs.binaryStatistics.sum diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 61c2ff5ed36..1230b4b35f3 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -5,6 +5,7 @@ import os import random from io import BytesIO +from string import ascii_lowercase import numpy as np import pandas as pd @@ -1421,3 +1422,73 @@ def test_orc_writer_lists_empty_rg(data): pdf_out = pa.orc.ORCFile(buffer).read().to_pandas() assert_eq(pdf_in, pdf_out) + + +def test_statistics_sum_overflow(): + maxint64 = np.iinfo(np.int64).max + minint64 = np.iinfo(np.int64).min + + buff = BytesIO() + with po.Writer( + buff, po.Struct(a=po.BigInt(), b=po.BigInt(), c=po.BigInt()) + ) as writer: + writer.write((maxint64, minint64, minint64)) + writer.write((1, -1, 1)) + + file_stats, stripe_stats = cudf.io.orc.read_orc_statistics([buff]) + assert file_stats[0]["a"].get("sum") is None + assert file_stats[0]["b"].get("sum") is None + assert file_stats[0]["c"].get("sum") == minint64 + 1 + + assert stripe_stats[0]["a"].get("sum") is None + assert stripe_stats[0]["b"].get("sum") is None + assert stripe_stats[0]["c"].get("sum") == minint64 + 1 + + +def test_empty_statistics(): + buff = BytesIO() + orc_schema = po.Struct( + a=po.BigInt(), + b=po.Double(), + c=po.String(), + d=po.Decimal(11, 2), + e=po.Date(), + f=po.Timestamp(), + g=po.Boolean(), + h=po.Binary(), + i=po.BigInt(), + # One column with non null value, else cudf/pyorc readers crash + ) + data = tuple([None] * (len(orc_schema.fields) - 1) + [1]) + with po.Writer(buff, orc_schema) as writer: 
+ writer.write(data) + + got = cudf.io.orc.read_orc_statistics([buff]) + + # Check for both file and stripe stats + for stats in got: + # Similar expected stats for the first 6 columns in this case + for col_name in ascii_lowercase[:6]: + assert stats[0][col_name].get("number_of_values") == 0 + assert stats[0][col_name].get("has_null") is True + assert stats[0][col_name].get("minimum") is None + assert stats[0][col_name].get("maximum") is None + for col_name in ascii_lowercase[:3]: + assert stats[0][col_name].get("sum") == 0 + # Sum for decimal column is a string + assert stats[0]["d"].get("sum") == "0" + + assert stats[0]["g"].get("number_of_values") == 0 + assert stats[0]["g"].get("has_null") is True + assert stats[0]["g"].get("true_count") == 0 + assert stats[0]["g"].get("false_count") == 0 + + assert stats[0]["h"].get("number_of_values") == 0 + assert stats[0]["h"].get("has_null") is True + assert stats[0]["h"].get("sum") == 0 + + assert stats[0]["i"].get("number_of_values") == 1 + assert stats[0]["i"].get("has_null") is False + assert stats[0]["i"].get("minimum") == 1 + assert stats[0]["i"].get("maximum") == 1 + assert stats[0]["i"].get("sum") == 1 From 2a34daf04af781a188b4d844a48493ee3723813a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 23 Sep 2021 19:33:33 -0600 Subject: [PATCH 23/26] Fix `interleave_columns` when the input string lists column having empty child column (#9292) This closes #9290. In particular, when the input lists column (of strings) contain all empty lists, the internal function still tries to access the first element of the child column (which is empty) causes a seg-fault. 
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - https://github.com/nvdbaranec - Mike Wilson (https://github.com/hyperbolic2346) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/9292 --- cpp/src/lists/interleave_columns.cu | 4 +++ .../lists/combine/concatenate_rows_tests.cpp | 36 ++++++++++++++----- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index 4d1d6448dd0..b9b73d98ed2 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -169,6 +169,10 @@ struct compute_string_sizes_and_interleave_lists_fn { auto const start_str_idx = list_offsets[list_id]; auto const end_str_idx = list_offsets[list_id + 1]; + // In case of empty list (i.e. it doesn't contain any string element), we just ignore it because + // there will not be anything to store for that list in the child column. + if (start_str_idx == end_str_idx) { return; } + // read_idx and write_idx are indices of string elements. 
size_type write_idx = dst_list_offsets[idx]; diff --git a/cpp/tests/lists/combine/concatenate_rows_tests.cpp b/cpp/tests/lists/combine/concatenate_rows_tests.cpp index 8aae523d12b..17d31c3e387 100644 --- a/cpp/tests/lists/combine/concatenate_rows_tests.cpp +++ b/cpp/tests/lists/combine/concatenate_rows_tests.cpp @@ -72,7 +72,7 @@ struct ListConcatenateRowsTypedTest : public cudf::test::BaseFixture { using TypesForTest = cudf::test::Concat; -TYPED_TEST_CASE(ListConcatenateRowsTypedTest, TypesForTest); +TYPED_TEST_SUITE(ListConcatenateRowsTypedTest, TypesForTest); TYPED_TEST(ListConcatenateRowsTypedTest, ConcatenateEmptyColumns) { @@ -110,10 +110,12 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputNoNull) { using ListsCol = cudf::test::lists_column_wrapper; - auto const col1 = ListsCol{{1, 2}, {3, 4}, {5, 6}}.release(); - auto const col2 = ListsCol{{7, 8}, {9, 10}, {11, 12}}.release(); - auto const expected = ListsCol{{1, 2, 7, 8}, {3, 4, 9, 10}, {5, 6, 11, 12}}.release(); - auto const results = cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}); + auto const col1 = ListsCol{{1, 2}, {3, 4}, {5, 6}}.release(); + auto const empty_lists = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}.release(); + auto const col2 = ListsCol{{7, 8}, {9, 10}, {11, 12}}.release(); + auto const expected = ListsCol{{1, 2, 7, 8}, {3, 4, 9, 10}, {5, 6, 11, 12}}.release(); + auto const results = + cudf::lists::concatenate_rows(TView{{col1->view(), empty_lists->view(), col2->view()}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, verbosity); } @@ -121,11 +123,13 @@ TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputWithNullableChild) { using ListsCol = cudf::test::lists_column_wrapper; - auto const col1 = ListsCol{{1, 2}, ListsCol{{null}, null_at(0)}, {5, 6}}.release(); - auto const col2 = ListsCol{{7, 8}, {9, 10}, {11, 12}}.release(); + auto const col1 = ListsCol{{1, 2}, ListsCol{{null}, null_at(0)}, {5, 6}}.release(); + auto const empty_lists = 
ListsCol{{ListsCol{}, ListsCol{}, ListsCol{}}, null_at(2)}.release(); + auto const col2 = ListsCol{{7, 8}, {9, 10}, {11, 12}}.release(); auto const expected = ListsCol{{1, 2, 7, 8}, ListsCol{{null, 9, 10}, null_at(0)}, {5, 6, 11, 12}}.release(); - auto const results = cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}); + auto const results = + cudf::lists::concatenate_rows(TView{{col1->view(), empty_lists->view(), col2->view()}}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, verbosity); } @@ -466,3 +470,19 @@ TEST_F(ListConcatenateRowsTest, SlicedStringsColumnsInputWithNulls) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, verbosity); } } + +TEST_F(ListConcatenateRowsTest, StringsColumnsWithEmptyListTest) +{ + auto const col1 = StrListsCol{{"1", "2", "3", "4"}}.release(); + auto const col2 = StrListsCol{{"a", "b", "c"}}.release(); + auto const col3 = StrListsCol{StrListsCol{}}.release(); + auto const col4 = StrListsCol{{"x", "y", "" /*NULL*/, "z"}, null_at(2)}.release(); + auto const col5 = StrListsCol{{StrListsCol{}}, null_at(0)}.release(); + auto const expected = + StrListsCol{{"1", "2", "3", "4", "a", "b", "c", "x", "y", "" /*NULL*/, "z"}, null_at(9)} + .release(); + auto const results = cudf::lists::concatenate_rows( + TView{{col1->view(), col2->view(), col3->view(), col4->view(), col5->view()}}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, verbosity); +} From 20498f7c8e7929b994f4e2d9efd68d010a65b416 Mon Sep 17 00:00:00 2001 From: Mayank Anand <36782063+mayankanand007@users.noreply.github.com> Date: Fri, 24 Sep 2021 09:32:28 -0400 Subject: [PATCH 24/26] Added deprecation warning for `.label_encoding()` (#9289) This PR addresses issue #8608 by adding a deprecation warning before we remove the functionality entirely. 
Authors: - Mayank Anand (https://github.com/mayankanand007) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9289 --- python/cudf/cudf/core/series.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 4ad2c325eeb..594f9fc42d0 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3637,6 +3637,12 @@ def label_encoding(self, cats, dtype=None, na_sentinel=-1): dtype: int8 """ + warnings.warn( + "Series.label_encoding is deprecated and will be removed in the future.\ + Consider using cuML's LabelEncoder instead", + DeprecationWarning, + ) + def _return_sentinel_series(): return Series( cudf.core.column.full( From ad76ed1e09d74430435f0d520e6a02d9c5b5eea6 Mon Sep 17 00:00:00 2001 From: Conor Hoekstra <36027403+codereport@users.noreply.github.com> Date: Fri, 24 Sep 2021 09:38:59 -0400 Subject: [PATCH 25/26] `fixed_point` `cudf::groupby` for `mean` aggregation (#9296) This fixes https://github.com/rapidsai/cudf/issues/9224. `fixed_point` `groupby` with `mean` shouldn't use `double` as the result type, it should use `fixed_point`. This PR fixes that. 
Authors: - Conor Hoekstra (https://github.com/codereport) Approvers: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/9296 --- .../cudf/detail/aggregation/aggregation.hpp | 10 ++-- cpp/tests/groupby/mean_tests.cpp | 52 +++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 4cf902ef562..5a1fc3b9398 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -954,14 +954,16 @@ template struct target_type_impl< Source, k, - std::enable_if_t() && !is_chrono() && (k == aggregation::MEAN)>> { + std::enable_if_t() && not is_chrono() && + not is_fixed_point() && (k == aggregation::MEAN)>> { using type = double; }; template -struct target_type_impl() && (k == aggregation::MEAN)>> { +struct target_type_impl< + Source, + k, + std::enable_if_t<(is_chrono() or is_fixed_point()) && (k == aggregation::MEAN)>> { using type = Source; }; diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp index 613e1555b79..d390c8a1880 100644 --- a/cpp/tests/groupby/mean_tests.cpp +++ b/cpp/tests/groupby/mean_tests.cpp @@ -160,5 +160,57 @@ TEST_F(groupby_dictionary_mean_test, basic) keys, vals, expect_keys, expect_vals, cudf::make_mean_aggregation()); } +template +struct FixedPointTestBothReps : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); + +TYPED_TEST(FixedPointTestBothReps, GroupBySortMeanDecimalAsValue) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + + for (auto const i : {2, 1, 0, -1, -2}) { + auto const scale = scale_type{i}; + // clang-format off + 
auto const keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = fp_wrapper{ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale}; + // clang-format on + + auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; + auto const expect_vals_min = fp_wrapper{{3, 4, 5}, scale}; + + auto agg = cudf::make_mean_aggregation(); + test_single_agg( + keys, vals, expect_keys, expect_vals_min, std::move(agg), force_use_sort_impl::YES); + } +} + +TYPED_TEST(FixedPointTestBothReps, GroupByHashMeanDecimalAsValue) +{ + using namespace numeric; + using decimalXX = TypeParam; + using RepType = cudf::device_storage_type_t; + using fp_wrapper = cudf::test::fixed_point_column_wrapper; + using K = int32_t; + + for (auto const i : {2, 1, 0, -1, -2}) { + auto const scale = scale_type{i}; + // clang-format off + auto const keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = fp_wrapper{ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, scale}; + // clang-format on + + auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; + auto const expect_vals_min = fp_wrapper{{3, 4, 5}, scale}; + + auto agg = cudf::make_mean_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals_min, std::move(agg)); + } +} + } // namespace test } // namespace cudf From ba763105e006494a536c1a2fafc5112ab3dae362 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Fri, 24 Sep 2021 09:37:11 -0500 Subject: [PATCH 26/26] Support for using tdigests to compute approximate percentiles. (#8983) Addresses https://github.com/rapidsai/cudf/issues/7170 Adds 3 pieces of new functionality: - A `TDIGEST` aggregation which creates a tdigest column (https://arxiv.org/pdf/1902.04023.pdf) from a stream of input scalars. - A `MERGE_TDIGEST` aggregation which merges multiple tdigest columns into a new one. - a `percentile_approx` function which performs percentile queries on tdigest data. 
Also exposes several ::detail functions (`sort`, `merge`, `slice`) in detail headers. Ready for review. I do need to add more tests though. Authors: - https://github.com/nvdbaranec Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Jake Hemstad (https://github.com/jrhemstad) - MithunR (https://github.com/mythrocks) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/8983 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 4 +- cpp/include/cudf/aggregation.hpp | 79 +- .../cudf/detail/aggregation/aggregation.hpp | 76 ++ cpp/include/cudf/detail/copy.hpp | 9 + cpp/include/cudf/detail/merge.cuh | 17 + cpp/include/cudf/detail/quantiles.hpp | 18 +- cpp/include/cudf/detail/sorting.hpp | 16 +- cpp/include/cudf/detail/tdigest/tdigest.hpp | 79 ++ cpp/include/cudf/quantiles.hpp | 28 + cpp/include/cudf/sorting.hpp | 6 +- cpp/include/cudf_test/column_utilities.hpp | 7 +- cpp/src/aggregation/aggregation.cpp | 41 + cpp/src/copying/slice.cu | 34 +- cpp/src/groupby/sort/aggregate.cpp | 91 ++ cpp/src/groupby/sort/group_reductions.hpp | 88 ++ cpp/src/groupby/sort/group_tdigest.cu | 841 ++++++++++++++++++ cpp/src/quantiles/tdigest/tdigest.cu | 383 ++++++++ cpp/src/sort/sort.cu | 8 +- cpp/src/sort/stable_sort.cu | 4 +- cpp/tests/CMakeLists.txt | 2 + cpp/tests/groupby/groupby_test_util.hpp | 55 ++ cpp/tests/groupby/tdigest_tests.cu | 584 ++++++++++++ cpp/tests/quantiles/percentile_approx_test.cu | 435 +++++++++ cpp/tests/utilities/column_utilities.cu | 61 +- 25 files changed, 2919 insertions(+), 48 deletions(-) create mode 100644 cpp/include/cudf/detail/tdigest/tdigest.hpp create mode 100644 cpp/src/groupby/sort/group_tdigest.cu create mode 100644 cpp/src/quantiles/tdigest/tdigest.cu create mode 100644 cpp/tests/groupby/tdigest_tests.cu create mode 100644 cpp/tests/quantiles/percentile_approx_test.cu diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml 
index c3450fe8d88..fd687de6698 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -93,6 +93,7 @@ test: - test -f $PREFIX/include/cudf/detail/sequence.hpp - test -f $PREFIX/include/cudf/detail/sorting.hpp - test -f $PREFIX/include/cudf/detail/stream_compaction.hpp + - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp - test -f $PREFIX/include/cudf/detail/transform.hpp - test -f $PREFIX/include/cudf/detail/transpose.hpp - test -f $PREFIX/include/cudf/detail/unary.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2df35aa0971..00af1973cfe 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -236,8 +236,9 @@ add_library(cudf src/groupby/sort/group_max_scan.cu src/groupby/sort/group_min_scan.cu src/groupby/sort/group_rank_scan.cu - src/groupby/sort/group_sum_scan.cu src/groupby/sort/group_replace_nulls.cu + src/groupby/sort/group_sum_scan.cu + src/groupby/sort/group_tdigest.cu src/groupby/sort/sort_helper.cu src/hash/hashing.cu src/hash/md5_hash.cu @@ -318,6 +319,7 @@ add_library(cudf src/merge/merge.cu src/partitioning/partitioning.cu src/partitioning/round_robin.cu + src/quantiles/tdigest/tdigest.cu src/quantiles/quantile.cu src/quantiles/quantiles.cu src/reductions/all.cu diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index c302895880d..fb6401a3cc1 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -87,7 +87,9 @@ class aggregation { CUDA, ///< CUDA UDF based reduction MERGE_LISTS, ///< merge multiple lists values into one list MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries - MERGE_M2 ///< merge partial values of M2 aggregation + MERGE_M2, ///< merge partial values of M2 aggregation + TDIGEST, ///< create a tdigest from a set of input values + MERGE_TDIGEST ///< create a tdigest by merging multiple tdigests together }; aggregation() = delete; @@ -493,5 +495,80 @@ std::unique_ptr 
make_merge_sets_aggregation(null_equality nulls_equal = nu template std::unique_ptr make_merge_m2_aggregation(); +/** + * @brief Factory to create a TDIGEST aggregation + * + * Produces a tdigest (https://arxiv.org/pdf/1902.04023.pdf) column from input values. + * The input aggregation values are expected to be fixed-width numeric types. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param max_centroids Parameter controlling compression level and accuracy on subsequent + * queries on the output tdigest data. `max_centroids` places an upper bound on the size of + * the computed tdigests: A value of 1000 will result in a tdigest containing no + * more than 1000 centroids (32 bytes each). Higher result in more accurate tdigest information. + * + * @returns A TDIGEST aggregation object. + */ +template +std::unique_ptr make_tdigest_aggregation(int max_centroids = 1000); + +/** + * @brief Factory to create a MERGE_TDIGEST aggregation + * + * Merges the results from a previous aggregation resulting from a `make_tdigest_aggregation` + * or `make_merge_tdigest_aggregation` to produce a new a tdigest + * (https://arxiv.org/pdf/1902.04023.pdf) column. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. 
they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param max_centroids Parameter controlling compression level and accuracy on subsequent + * queries on the output tdigest data. `max_centroids` places an upper bound on the size of + * the computed tdigests: A value of 1000 will result in a tdigest containing no + * more than 1000 centroids (32 bytes each). Higher result in more accurate tdigest information. + * + * @returns A MERGE_TDIGEST aggregation object. + */ +template +std::unique_ptr make_merge_tdigest_aggregation(int max_centroids = 1000); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 5a1fc3b9398..05d1bf3e595 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -91,6 +91,10 @@ class simple_aggregations_collector { // Declares the interface for the simple class merge_sets_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_m2_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class tdigest_aggregation const& agg); + virtual std::vector> visit( + data_type col_type, class merge_tdigest_aggregation const& agg); }; class aggregation_finalizer { // Declares the interface for the finalizer @@ -125,6 +129,8 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class merge_lists_aggregation const& agg); virtual void visit(class merge_sets_aggregation const& agg); virtual void visit(class merge_m2_aggregation const& agg); + virtual void visit(class tdigest_aggregation const& agg); + virtual void 
visit(class merge_tdigest_aggregation const& agg); }; /** @@ -884,6 +890,54 @@ class merge_m2_aggregation final : public groupby_aggregation { void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +/** + * @brief Derived aggregation class for specifying TDIGEST aggregation + */ +class tdigest_aggregation final : public groupby_aggregation { + public: + explicit tdigest_aggregation(int max_centroids_) + : aggregation{TDIGEST}, max_centroids{max_centroids_} + { + } + + int const max_centroids; + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + +/** + * @brief Derived aggregation class for specifying MERGE_TDIGEST aggregation + */ +class merge_tdigest_aggregation final : public groupby_aggregation { + public: + explicit merge_tdigest_aggregation(int max_centroids_) + : aggregation{MERGE_TDIGEST}, max_centroids{max_centroids_} + { + } + + int const max_centroids; + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } +}; + /** * @brief Sentinel value used for `ARGMAX` aggregation. * @@ -1120,6 +1174,24 @@ struct target_type_impl { using type = struct_view; }; +// Always use numeric types for TDIGEST +template +struct target_type_impl() || is_fixed_point())>> { + using type = struct_view; +}; + +// TDIGEST_MERGE. The root column type for a tdigest column is a list_view. 
Strictly +// speaking, this check is not sufficient to guarantee we are actually being given a +// real tdigest column, but we will do further verification inside the aggregation code. +template +struct target_type_impl>> { + using type = struct_view; +}; + /** * @brief Helper alias to get the accumulator type for performing aggregation * `k` on elements of type `Source` @@ -1224,6 +1296,10 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::MERGE_M2: return f.template operator()(std::forward(args)...); + case aggregation::TDIGEST: + return f.template operator()(std::forward(args)...); + case aggregation::MERGE_TDIGEST: + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Unsupported aggregation."); diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index fb5cfad6186..9f06661c8d1 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -75,6 +75,15 @@ std::vector slice(column_view const& input, std::vector const& indices, rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::slice(table_view const&,std::vector const&) + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::vector slice(table_view const& input, + std::vector const& indices, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + /** * @copydoc cudf::shift(column_view const&,size_type,scalar const&, * rmm::mr::device_memory_resource*) diff --git a/cpp/include/cudf/detail/merge.cuh b/cpp/include/cudf/detail/merge.cuh index a779c3defbb..ec83e348e33 100644 --- a/cpp/include/cudf/detail/merge.cuh +++ b/cpp/include/cudf/detail/merge.cuh @@ -145,5 +145,22 @@ struct row_lexicographic_tagged_comparator { order const* _column_order{}; }; +/** + * @copydoc std::unique_ptr merge( + * std::vector const& tables_to_merge, + * std::vector const& key_cols, + * std::vector const& column_order, + * std::vector const& null_precedence, + * rmm::mr::device_memory_resource* mr) + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr merge(std::vector const& tables_to_merge, + std::vector const& key_cols, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp index 5fb2ce4cbe6..7a76f9cab88 100644 --- a/cpp/include/cudf/detail/quantiles.hpp +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -22,7 +22,8 @@ namespace cudf { namespace detail { -/** @copydoc cudf::quantile() +/** + * @copydoc cudf::quantile() * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -35,7 +36,8 @@ std::unique_ptr quantile( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** @copydoc cudf::quantiles() +/** + * @copydoc cudf::quantiles() * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -49,5 +51,17 @@ std::unique_ptr
quantiles( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::percentile_approx(column_view const&, column_view const&, + * rmm::mr::device_memory_resource*) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr percentile_approx( + column_view const& input, + column_view const& percentiles, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp index 3127a5f89f1..b5dfb34c043 100644 --- a/cpp/include/cudf/detail/sorting.hpp +++ b/cpp/include/cudf/detail/sorting.hpp @@ -32,7 +32,7 @@ namespace detail { * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr sorted_order( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -44,7 +44,7 @@ std::unique_ptr sorted_order( * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr stable_sorted_order( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::cuda_stream_view stream = rmm::cuda_stream_default, @@ -90,5 +90,17 @@ std::unique_ptr
segmented_sort_by_key( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::sort + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr
sort( + table_view const& values, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp new file mode 100644 index 00000000000..94c22911c1e --- /dev/null +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +namespace cudf { +namespace detail { + +namespace tdigest { + +// mean and weight column indices within tdigest inner struct columns +constexpr size_type mean_column_index = 0; +constexpr size_type weight_column_index = 1; + +// min and max column indices within tdigest outer struct columns +constexpr size_type centroid_column_index = 0; +constexpr size_type min_column_index = 1; +constexpr size_type max_column_index = 2; + +/** + * @brief Verifies that the input column is a valid tdigest column. + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. 
they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param col Column to be checked + * + * @throws cudf::logic_error if the column is not a valid tdigest column. + */ +void check_is_valid_tdigest_column(column_view const& col); + +/** + * @brief Create an empty tdigest column. + * + * An empty tdigest column contains a single row of length 0 + * + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. + * + * @returns An empty tdigest column. + */ +std::unique_ptr make_empty_tdigest_column( + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace tdigest +} // namespace detail +} // namespace cudf \ No newline at end of file diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp index 94b5c344f4f..d21f6dff79c 100644 --- a/cpp/include/cudf/quantiles.hpp +++ b/cpp/include/cudf/quantiles.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -94,5 +95,32 @@ std::unique_ptr
quantiles( std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Calculate approximate percentiles on an input tdigest column. + * + * tdigest (https://arxiv.org/pdf/1902.04023.pdf) columns are produced specifically + * by the TDIGEST and MERGE_TDIGEST aggregations. These columns represent + * compressed representations of a very large input data set that can be + * queried for quantile information. + * + * Produces a LIST column where each row `i` represents output from querying the + * corresponding tdigest from `input` row `i`. The length of each output list + * is the number of percentages specified in `percentages`. + * + * @param input tdigest input data. One tdigest per row. + * @param percentiles Desired percentiles in range [0, 1]. + * @param mr Device memory resource used to allocate the returned column's device + * memory + * + * @throws cudf::logic_error if `input` is not a valid tdigest column. + * @throws cudf::logic_error if `percentiles` is not a FLOAT64 column. + * + * @returns LIST Column containing requested percentile values as FLOAT64. 
+ */ +std::unique_ptr percentile_approx( + structs_column_view const& input, + column_view const& percentiles, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 36a8131a78e..69eb8b3490a 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -58,7 +58,7 @@ enum class rank_method { * `input` if it were sorted */ std::unique_ptr sorted_order( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -72,7 +72,7 @@ std::unique_ptr sorted_order( * @copydoc cudf::sorted_order */ std::unique_ptr stable_sorted_order( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -112,7 +112,7 @@ bool is_sorted(cudf::table_view const& table, * @return New table containing the desired sorted order of `input` */ std::unique_ptr
sort( - table_view input, + table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index 553d8a97bd2..aa77686fee4 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -38,6 +38,8 @@ enum class debug_output_level { QUIET // no debug output }; +constexpr size_type default_ulp = 4; + /** * @brief Verifies the property equality of two columns. * @@ -93,12 +95,15 @@ bool expect_columns_equal(cudf::column_view const& lhs, * @param lhs The first column * @param rhs The second column * @param verbosity Level of debug output verbosity + * @param fp_ulps # of ulps of tolerance to allow when comparing + * floating point values * * @returns True if the columns (and their properties) are equivalent, false otherwise */ bool expect_columns_equivalent(cudf::column_view const& lhs, cudf::column_view const& rhs, - debug_output_level verbosity = debug_output_level::FIRST_ERROR); + debug_output_level verbosity = debug_output_level::FIRST_ERROR, + size_type fp_ulps = cudf::test::default_ulp); /** * @brief Verifies the bitwise equality of two device memory buffers. 
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index c3d992e1181..b550b61785b 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -202,6 +202,18 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, tdigest_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + +std::vector> simple_aggregations_collector::visit( + data_type col_type, merge_tdigest_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} + // aggregation_finalizer ---------------------------------------- void aggregation_finalizer::visit(aggregation const& agg) {} @@ -346,6 +358,16 @@ void aggregation_finalizer::visit(merge_m2_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(tdigest_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(merge_tdigest_aggregation const& agg) +{ + visit(static_cast(agg)); +} + } // namespace detail std::vector> aggregation::get_simple_aggregations( @@ -668,6 +690,25 @@ std::unique_ptr make_merge_m2_aggregation() template std::unique_ptr make_merge_m2_aggregation(); template std::unique_ptr make_merge_m2_aggregation(); +template +std::unique_ptr make_tdigest_aggregation(int max_centroids) +{ + return std::make_unique(max_centroids); +} +template std::unique_ptr make_tdigest_aggregation(int max_centroids); +template std::unique_ptr make_tdigest_aggregation( + int max_centroids); + +template +std::unique_ptr make_merge_tdigest_aggregation(int max_centroids) +{ + return std::make_unique(max_centroids); +} +template std::unique_ptr make_merge_tdigest_aggregation( + int max_centroids); +template std::unique_ptr make_merge_tdigest_aggregation( + int max_centroids); + namespace detail { namespace { struct target_type_functor { diff --git a/cpp/src/copying/slice.cu 
b/cpp/src/copying/slice.cu index 0e41689dc4b..d1c12056393 100644 --- a/cpp/src/copying/slice.cu +++ b/cpp/src/copying/slice.cu @@ -63,17 +63,9 @@ std::vector slice(column_view const& input, return std::vector{begin, begin + indices.size() / 2}; } -} // namespace detail - -std::vector slice(cudf::column_view const& input, - std::vector const& indices) -{ - CUDF_FUNC_RANGE(); - return detail::slice(input, indices, rmm::cuda_stream_default); -} - -std::vector slice(cudf::table_view const& input, - std::vector const& indices) +std::vector slice(table_view const& input, + std::vector const& indices, + rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); CUDF_EXPECTS(indices.size() % 2 == 0, "indices size must be even"); @@ -81,7 +73,7 @@ std::vector slice(cudf::table_view const& input, // 2d arrangement of column_views that represent the outgoing table_views sliced_table[i][j] // where i is the i'th column of the j'th table_view - auto op = [&indices](auto const& c) { return cudf::slice(c, indices); }; + auto op = [&indices, stream](auto const& c) { return cudf::detail::slice(c, indices, stream); }; auto f = thrust::make_transform_iterator(input.begin(), op); auto sliced_table = std::vector>(f, f + input.num_columns()); @@ -99,6 +91,22 @@ std::vector slice(cudf::table_view const& input, } return result; -}; +} + +} // namespace detail + +std::vector slice(cudf::column_view const& input, + std::vector const& indices) +{ + CUDF_FUNC_RANGE(); + return detail::slice(input, indices, rmm::cuda_stream_default); +} + +std::vector slice(cudf::table_view const& input, + std::vector const& indices) +{ + CUDF_FUNC_RANGE(); + return detail::slice(input, indices, rmm::cuda_stream_default); +} } // namespace cudf diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 726b51b7702..9f3d67ac38b 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -525,6 +525,97 @@ void 
aggregate_result_functor::operator()(aggregation con get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); }; +/** + * @brief Generate a tdigest column from a grouped set of numeric input values. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) { return; } + + auto const max_centroids = + dynamic_cast(agg).max_centroids; + + auto count_agg = make_count_aggregation(); + operator()(*count_agg); + column_view valid_counts = cache.get_result(col_idx, *count_agg); + + cache.add_result(col_idx, + agg, + detail::group_tdigest( + get_sorted_values(), + helper.group_offsets(stream), + helper.group_labels(stream), + {valid_counts.begin(), static_cast(valid_counts.size())}, + helper.num_groups(stream), + max_centroids, + stream, + mr)); +}; + +/** + * @brief Generate a merged tdigest column from a grouped set of input tdigest columns. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. 
The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(col_idx, agg)) { return; } + + auto const max_centroids = + dynamic_cast(agg).max_centroids; + cache.add_result(col_idx, + agg, + detail::group_merge_tdigest(get_grouped_values(), + helper.group_offsets(stream), + helper.group_labels(stream), + helper.num_groups(stream), + max_centroids, + stream, + mr)); +}; + } // namespace detail // Sort-based groupby diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 2770162da2d..cb01ee8e053 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -442,6 +442,94 @@ std::unique_ptr group_merge_m2(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Generate a tdigest column from a grouped set of numeric input values. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param values Grouped (and sorted) values to merge. + * @param group_offsets Offsets of groups' starting points within @p values. + * @param group_labels 0-based ID of group that the corresponding value belongs to + * @param group_valid_counts Per-group counts of valid elements. + * @param num_groups Number of groups. 
+ * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher + * values result in a larger, more precise tdigest. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns tdigest column, with 1 tdigest per row + */ +std::unique_ptr group_tdigest(column_view const& values, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Merges tdigests within the same group to generate a new tdigest. + * + * The tdigest column produced is of the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * }, + * ... + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + * + * Each output row is a single tdigest. The length of the row is the "size" of the + * tdigest, each element of which represents a weighted centroid (mean, weight). + * + * @param values Grouped tdigests to merge. + * @param group_offsets Offsets of groups' starting points within @p values. + * @param group_labels 0-based ID of group that the corresponding value belongs to + * @param num_groups Number of groups. + * @param max_centroids Parameter controlling the level of compression of the tdigest. Higher + * values result in a larger, more precise tdigest. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns tdigest column, with 1 tdigest per row + */ +std::unique_ptr group_merge_tdigest(column_view const& values, + cudf::device_span group_offsets, + cudf::device_span group_labels, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + /** @endinternal * */ diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu new file mode 100644 index 00000000000..5b4252a9063 --- /dev/null +++ b/cpp/src/groupby/sort/group_tdigest.cu @@ -0,0 +1,841 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace groupby { +namespace detail { + +namespace { + +// the most representative point within a cluster of similar +// values. {mean, weight} +// NOTE: Using a tuple here instead of a struct to take advantage of +// thrust zip iterators for output. +using centroid = thrust::tuple; + +// make a centroid from a scalar with a weight of 1. 
+template +struct make_centroid { + column_device_view const col; + + centroid operator() __device__(size_type index) + { + return {static_cast(col.element(index)), 1, col.is_valid(index)}; + } +}; + +// make a centroid from an input stream of mean/weight values. +struct make_weighted_centroid { + double const* mean; + double const* weight; + + centroid operator() __device__(size_type index) { return {mean[index], weight[index], true}; } +}; + +// merge two centroids +struct merge_centroids { + centroid operator() __device__(centroid const& lhs, centroid const& rhs) + { + bool const lhs_valid = thrust::get<2>(lhs); + bool const rhs_valid = thrust::get<2>(rhs); + if (!lhs_valid && !rhs_valid) { return {0, 0, false}; } + if (!lhs_valid) { return rhs; } + if (!rhs_valid) { return lhs; } + + double const lhs_mean = thrust::get<0>(lhs); + double const rhs_mean = thrust::get<0>(rhs); + double const lhs_weight = thrust::get<1>(lhs); + double const rhs_weight = thrust::get<1>(rhs); + double const new_weight = lhs_weight + rhs_weight; + return {(lhs_mean * lhs_weight + rhs_mean * rhs_weight) / new_weight, new_weight, true}; + } +}; + +/** + * @brief A functor which returns the nearest cumulative weight in the input stream prior to the + * specified next weight limit. + * + * This functor assumes the weight for all scalars is simply 1. Under this assumption, + * the nearest weight that will be <= the next limit is simply the nearest integer < the limit, + * which we can get by just taking floor(next_limit). For example if our next limit is 3.56, the + * nearest whole number <= it is floor(3.56) == 3. + */ +struct nearest_value_scalar_weights { + thrust::pair operator() __device__(double next_limit, size_type) + { + double const f = floor(next_limit); + return {f, max(0, static_cast(next_limit) - 1)}; + } +}; + +/** + * @brief A functor which returns the nearest cumulative weight in the input stream prior to the + * specified next weight limit. 
+ * + * This functor assumes we are dealing with grouped, sorted, weighted centroids. + */ +struct nearest_value_centroid_weights { + double const* cumulative_weights; + offset_type const* outer_offsets; // groups + offset_type const* inner_offsets; // tdigests within a group + + thrust::pair operator() __device__(double next_limit, size_type group_index) + { + auto const tdigest_begin = outer_offsets[group_index]; + auto const tdigest_end = outer_offsets[group_index + 1]; + auto const num_weights = inner_offsets[tdigest_end] - inner_offsets[tdigest_begin]; + double const* group_cumulative_weights = cumulative_weights + inner_offsets[tdigest_begin]; + + auto const index = ((thrust::lower_bound(thrust::seq, + group_cumulative_weights, + group_cumulative_weights + num_weights, + next_limit)) - + group_cumulative_weights); + + return index == 0 ? thrust::pair{0, 0} + : thrust::pair{group_cumulative_weights[index - 1], index - 1}; + } +}; + +/** + * @brief A functor which returns the cumulative input weight for a given index in a + * set of grouped input values. + * + * This functor assumes the weight for all scalars is simply 1. Under this assumption, + * the cumulative weight for a given value index I is simply I+1. + */ +struct cumulative_scalar_weight { + cudf::device_span group_offsets; + cudf::device_span group_labels; + std::tuple operator() __device__(size_type value_index) const + { + auto const group_index = group_labels[value_index]; + auto const relative_value_index = value_index - group_offsets[group_index]; + return {group_index, relative_value_index, relative_value_index + 1}; + } +}; + +/** + * @brief A functor which returns the cumulative input weight for a given index in a + * set of grouped input centroids. + * + * This functor assumes we are dealing with grouped, weighted centroids. 
+ */ +struct cumulative_centroid_weight { + double const* cumulative_weights; + cudf::device_span group_labels; + offset_type const* outer_offsets; // groups + cudf::device_span inner_offsets; // tdigests with a group + + std::tuple operator() __device__(size_type value_index) const + { + auto const tdigest_index = + static_cast( + thrust::upper_bound(thrust::seq, inner_offsets.begin(), inner_offsets.end(), value_index) - + inner_offsets.begin()) - + 1; + auto const group_index = group_labels[tdigest_index]; + auto const first_tdigest_index = outer_offsets[group_index]; + auto const first_weight_index = inner_offsets[first_tdigest_index]; + auto const relative_value_index = value_index - first_weight_index; + double const* group_cumulative_weights = cumulative_weights + first_weight_index; + + return {group_index, relative_value_index, group_cumulative_weights[relative_value_index]}; + } +}; + +// a monotonically increasing scale function which produces a distribution +// of centroids that is more densely packed in the middle of the input +// than at the ends. +__device__ double scale_func_k1(double quantile, double delta_norm) +{ + double k = delta_norm * asin(2.0 * quantile - 1.0); + k += 1.0; + double q = (sin(k / delta_norm) + 1.0) / 2.0; + return q; +} + +/** + * @brief Compute a set of cluster limits (brackets, essentially) for a + * given tdigest based on the specified delta and the total weight of values + * to be added. + * + * The number of clusters generated will always be <= delta_, where delta_ is + * a reasonably small number likely << 10000. + * + * Each input group gets an independent set of clusters generated. 1 thread + * per group. + * + * This kernel is called in a two-pass style. Once to compute the per-group + * cluster sizes and total # of clusters, and once to compute the actual + * weight limits per cluster. 
+ * + * @param delta_ tdigest compression level + * @param num_groups The number of input groups + * @param nearest_weight_ A functor which returns the nearest weight in the input + * stream that falls before our current cluster limit + * @param total_weight_ A functor which returns the expected total weight for + * the entire stream of input values for the specified group. + * @param group_cluster_wl Output. The set of cluster weight limits for each group. + * @param group_num_clusters Output. The number of output clusters for each input group. + * @param group_cluster_offsets Offsets per-group to the start of it's clusters + * + */ +template +__global__ void generate_cluster_limits_kernel(int delta_, + size_type num_groups, + NearestWeightFunc nearest_weight, + TotalWeightIter total_weight_, + CumulativeWeight cumulative_weight, + double* group_cluster_wl, + size_type* group_num_clusters, + offset_type const* group_cluster_offsets) +{ + int const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto const group_index = tid; + if (group_index >= num_groups) { return; } + + // we will generate at most delta clusters. + double const delta = static_cast(delta_); + double const delta_norm = delta / (2.0 * M_PI); + double const total_weight = total_weight_[group_index]; + group_num_clusters[group_index] = 0; + // a group with nothing in it. + if (total_weight <= 0) { return; } + + // start at the correct place based on our cluster offset. + double* cluster_wl = + group_cluster_wl ? group_cluster_wl + group_cluster_offsets[group_index] : nullptr; + + double cur_limit = 0.0; + double cur_weight = 0.0; + double next_limit = -1.0; + int last_inserted_index = -1; + + // compute the first cluster limit + double nearest_w; + int nearest_w_index; + while (1) { + cur_weight = next_limit < 0 ? 
0 : max(cur_weight + 1, nearest_w); + if (cur_weight >= total_weight) { break; } + + // based on where we are closing the cluster off (not including the incoming weight), + // compute the next cluster limit + double const quantile = cur_weight / total_weight; + next_limit = total_weight * scale_func_k1(quantile, delta_norm); + + // if the next limit is < the cur limit, we're past the end of the distribution, so we're done. + if (next_limit <= cur_limit) { + if (cluster_wl) { cluster_wl[group_num_clusters[group_index]] = total_weight; } + group_num_clusters[group_index]++; + break; + } + + // compute the weight we will be at in the input values just before closing off the current + // cluster (because adding the next value will cross the current limit). + // NOTE: can't use structured bindings here. + thrust::tie(nearest_w, nearest_w_index) = nearest_weight(next_limit, group_index); + + if (cluster_wl) { + // because of the way the scale functions work, it is possible to generate clusters + // in such a way that we end up with "gaps" where there are no input values that + // fall into a given cluster. An example would be this: + // + // cluster weight limits = 0.00003, 1.008, 3.008 + // + // input values(weight) = A(1), B(2), C(3) + // + // naively inserting these values into the clusters simply by taking a lower_bound, + // we would get the following distribution of input values into those 3 clusters. + // (), (A), (B,C) + // + // whereas what we really want is: + // + // (A), (B), (C) + // + // to fix this, we will artificially adjust the output cluster limits to guarantee + // at least 1 input value will be put in each cluster during the reduction step. + // this does not affect final centroid results as we still use the "real" weight limits + // to compute subsequent clusters - the purpose is only to allow cluster selection + // during the reduction step to be trivial. 
+ // + double adjusted_next_limit = next_limit; + if (nearest_w_index == last_inserted_index || last_inserted_index < 0) { + nearest_w_index = last_inserted_index + 1; + auto [r, i, adjusted] = cumulative_weight(nearest_w_index); + adjusted_next_limit = max(next_limit, adjusted); + } + cluster_wl[group_num_clusters[group_index]] = adjusted_next_limit; + last_inserted_index = nearest_w_index; + } + group_num_clusters[group_index]++; + cur_limit = next_limit; + } +} + +/** + * @brief Compute a set of cluster limits (brackets, essentially) for a + * given tdigest based on the specified delta and the total weight of values + * to be added. + * + * The number of clusters generated will always be <= delta_, where delta_ is + * a reasonably small number likely << 10000. + * + * Each input group gets an independent set of clusters generated. + * + * @param delta_ tdigest compression level + * @param num_groups The number of input groups + * @param nearest_weight A functor which returns the nearest weight in the input + * stream that falls before our current cluster limit + * @param total_weight A functor which returns the expected total weight for + * the entire stream of input values for the specified group. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A tuple containing the set of cluster weight limits for each group, a set of + * list-style offsets indicating group sizes, and the total number of clusters + */ +template +std::tuple, std::unique_ptr, size_type> +generate_group_cluster_info(int delta, + size_type num_groups, + NearestWeight nearest_weight, + TotalWeightIter total_weight, + CumulativeWeight cumulative_weight, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + constexpr size_type block_size = 256; + cudf::detail::grid_1d const grid(num_groups, block_size); + + // compute number of clusters per group + // each thread computes 1 set of clusters (# of cluster sets == # of groups) + rmm::device_uvector group_num_clusters(num_groups, stream); + generate_cluster_limits_kernel<<>>( + delta, + num_groups, + nearest_weight, + total_weight, + cumulative_weight, + nullptr, + group_num_clusters.begin(), + nullptr); + + // generate group cluster offsets (where the clusters for a given group start and end) + auto group_cluster_offsets = cudf::make_fixed_width_column( + data_type{type_id::INT32}, num_groups + 1, mask_state::UNALLOCATED, stream, mr); + auto cluster_size = cudf::detail::make_counting_transform_iterator( + 0, [group_num_clusters = group_num_clusters.begin(), num_groups] __device__(size_type index) { + return index == num_groups ? 
0 : group_num_clusters[index]; + }); + thrust::exclusive_scan(rmm::exec_policy(stream), + cluster_size, + cluster_size + num_groups + 1, + group_cluster_offsets->mutable_view().begin(), + 0); + + // total # of clusters + offset_type total_clusters = + cudf::detail::get_value(group_cluster_offsets->view(), num_groups, stream); + + // fill in the actual cluster weight limits + rmm::device_uvector group_cluster_wl(total_clusters, stream); + generate_cluster_limits_kernel<<>>( + delta, + num_groups, + nearest_weight, + total_weight, + cumulative_weight, + group_cluster_wl.begin(), + group_num_clusters.begin(), + group_cluster_offsets->view().begin()); + + return {std::move(group_cluster_wl), + std::move(group_cluster_offsets), + static_cast(total_clusters)}; +} + +/** + * @brief Compute a column of tdigests. + * + * Assembles the output tdigest column based on the specified delta, a stream of + * input values (either scalar or centroids), and an assortment of per-group + * clustering information. + * + * This function is effectively just a reduce_by_key that performs a reduction + * from input values -> centroid clusters as defined by the the cluster weight + * boundaries. + * + * @param delta tdigest compression level + * @param values_begin Beginning of the range of input values. + * @param values_end End of the range of input values. + * @param cumulative_weight Functor which returns cumulative weight and group information for + * an absolute input value index. + * @param min_col Column containing the minimum value per group. + * @param max_col Column containing the maximum value per group. + * @param group_cluster_wl Cluster weight limits for each group. + * @param group_cluster_offsets R-value reference of offsets into the cluster weight limits. + * @param total_clusters Total number of clusters in all groups. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A tdigest column with 1 row per output tdigest. + */ +template +std::unique_ptr compute_tdigests(int delta, + CentroidIter centroids_begin, + CentroidIter centroids_end, + CumulativeWeight group_cumulative_weight, + std::unique_ptr&& min_col, + std::unique_ptr&& max_col, + rmm::device_uvector const& group_cluster_wl, + std::unique_ptr&& group_cluster_offsets, + size_type total_clusters, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // the output for each group is column of data that represents the tdigest. since we want 1 row + // per group, each row will be a list the length of the tdigest for that group. so our output + // column is of the form: + // struct { + // centroids for the digest + // list { + // struct { + // double // mean + // double // weight + // } + // } + // double // min + // double // max + // } + // + // + if (total_clusters == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); } + std::vector> inner_children; + // mean + inner_children.push_back(cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, total_clusters, mask_state::UNALLOCATED, stream, mr)); + // weight + inner_children.push_back(cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, total_clusters, mask_state::UNALLOCATED, stream, mr)); + // tdigest struct + auto tdigests = + cudf::make_structs_column(total_clusters, std::move(inner_children), 0, {}, stream, mr); + + // each input group represents an individual tdigest. within each tdigest, we want the keys + // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall + // into the range 0-99). But since we have multiple tdigests, we need to keep the keys unique + // between the groups, so we add our group start offset. 
+ auto keys = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [delta, + group_cluster_wl = group_cluster_wl.data(), + group_cluster_offsets = group_cluster_offsets->view().begin(), + group_cumulative_weight] __device__(size_type value_index) -> size_type { + auto [group_index, relative_value_index, cumulative_weight] = + group_cumulative_weight(value_index); + + // compute start of cluster weight limits for this group + double const* weight_limits = group_cluster_wl + group_cluster_offsets[group_index]; + auto const num_clusters = + group_cluster_offsets[group_index + 1] - group_cluster_offsets[group_index]; + + // local cluster index + size_type const group_cluster_index = + min(num_clusters - 1, + static_cast( + thrust::lower_bound( + thrust::seq, weight_limits, weight_limits + num_clusters, cumulative_weight) - + weight_limits)); + + // add the cluster offset to generate a globally unique key + return group_cluster_index + group_cluster_offsets[group_index]; + }); + + // reduce the centroids down by key. 
+ cudf::mutable_column_view mean_col = + tdigests->child(cudf::detail::tdigest::mean_column_index).mutable_view(); + cudf::mutable_column_view weight_col = + tdigests->child(cudf::detail::tdigest::weight_column_index).mutable_view(); + auto output = thrust::make_zip_iterator(thrust::make_tuple( + mean_col.begin(), weight_col.begin(), thrust::make_discard_iterator())); + auto const num_values = std::distance(centroids_begin, centroids_end); + thrust::reduce_by_key(rmm::exec_policy(stream), + keys, + keys + num_values, // keys + centroids_begin, // values + thrust::make_discard_iterator(), // key output + output, // output + thrust::equal_to{}, // key equality check + merge_centroids{}); + + // create the list + auto const num_groups = group_cluster_offsets->size() - 1; + auto list = cudf::make_lists_column( + num_groups, std::move(group_cluster_offsets), std::move(tdigests), 0, {}); + + // create final tdigest column + std::vector> children; + children.push_back(std::move(list)); + children.push_back(std::move(min_col)); + children.push_back(std::move(max_col)); + return make_structs_column(num_groups, std::move(children), 0, {}, stream, mr); +} + +// retrieve total weight of scalar inputs by group index +struct scalar_total_weight { + size_type const* group_valid_counts; + __device__ double operator()(size_type group_index) { return group_valid_counts[group_index]; } +}; + +// return the min/max value of scalar inputs by group index +template +struct get_scalar_minmax { + column_device_view const col; + device_span group_offsets; + size_type const* group_valid_counts; + + __device__ thrust::tuple operator()(size_type group_index) + { + // note: .element() is taking care of fixed-point conversions for us. 
+ return {static_cast(col.element(group_offsets[group_index])), + static_cast( + col.element(group_offsets[group_index] + (group_valid_counts[group_index] - 1)))}; + } +}; + +struct typed_group_tdigest { + template < + typename T, + typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& col, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int delta, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + // first, generate cluster weight information for each input group + auto total_weight = cudf::detail::make_counting_transform_iterator( + 0, scalar_total_weight{group_valid_counts.begin()}); + auto [group_cluster_wl, group_cluster_offsets, total_clusters] = + generate_group_cluster_info(delta, + num_groups, + nearest_value_scalar_weights{}, + total_weight, + cumulative_scalar_weight{group_offsets, group_labels}, + stream, + mr); + + // device column view. handy because the .element() function + // automatically handles fixed-point conversions for us + auto d_col = cudf::column_device_view::create(col); + + // compute min and max columns + auto min_col = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + auto max_col = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_groups, + thrust::make_zip_iterator(thrust::make_tuple(min_col->mutable_view().begin(), + max_col->mutable_view().begin())), + get_scalar_minmax{*d_col, group_offsets, group_valid_counts.begin()}); + + // for simple input values, the "centroids" all have a weight of 1. 
+ auto scalar_to_centroid = + cudf::detail::make_counting_transform_iterator(0, make_centroid{*d_col}); + + // generate the final tdigest + return compute_tdigests(delta, + scalar_to_centroid, + scalar_to_centroid + col.size(), + cumulative_scalar_weight{group_offsets, group_labels}, + std::move(min_col), + std::move(max_col), + group_cluster_wl, + std::move(group_cluster_offsets), + total_clusters, + stream, + mr); + } + + template < + typename T, + typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& col, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int delta, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + CUDF_FAIL("Non-numeric type in group_tdigest"); + } +}; + +} // anonymous namespace + +std::unique_ptr group_tdigest(column_view const& col, + cudf::device_span group_offsets, + cudf::device_span group_labels, + cudf::device_span group_valid_counts, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (col.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); } + + auto const delta = max_centroids; + return cudf::type_dispatcher(col.type(), + typed_group_tdigest{}, + col, + group_offsets, + group_labels, + group_valid_counts, + num_groups, + delta, + stream, + mr); +} + +std::unique_ptr group_merge_tdigest(column_view const& input, + cudf::device_span group_offsets, + cudf::device_span group_labels, + size_type num_groups, + int max_centroids, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + cudf::detail::tdigest::check_is_valid_tdigest_column(input); + + if (num_groups == 0 || input.size() == 0) { + return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); + } + + structs_column_view scv(input); + lists_column_view 
lcv(scv.child(cudf::detail::tdigest::centroid_column_index)); + // ideally, we would just call .parent().child() here because tdigests cannot be + // sliced. however, lists_column_view() hides that particular interface. However, + // for the same reason, get_sliced_child() should be just as cheap. + auto data = lcv.get_sliced_child(stream); + structs_column_view tdigest(data); + auto mean = tdigest.child(cudf::detail::tdigest::mean_column_index); + auto weight = tdigest.child(cudf::detail::tdigest::weight_column_index); + + // first step is to merge all the tdigests in each group. at the moment the only way to + // make this work is to retrieve the group sizes (via group_offsets) and the individual digest + // sizes (via input.offsets()) to the gpu and do the merges. The scale problem is that while the + // size of each group will likely be small (size of each group will typically map to # of batches + // the input data was chopped into for tdigest generation), the -number- of groups can be + // arbitrarily large. + // + // thrust::merge and thrust::merge_by_key don't provide what we need. What we would need is an + // algorithm like a super-merge that takes two layers of keys: one which identifies the outer + // grouping of tdigests, and one which identifies the inner groupings of the tdigests within the + // outer groups. 
+ + // bring group offsets back to the host + std::vector h_outer_offsets(group_offsets.size()); + cudaMemcpyAsync(h_outer_offsets.data(), + group_offsets.data(), + sizeof(size_type) * group_offsets.size(), + cudaMemcpyDeviceToHost, + stream); + + // bring tdigest offsets back to the host + auto tdigest_offsets = lcv.offsets(); + std::vector h_inner_offsets(tdigest_offsets.size()); + cudaMemcpyAsync(h_inner_offsets.data(), + tdigest_offsets.begin(), + sizeof(size_type) * tdigest_offsets.size(), + cudaMemcpyDeviceToHost, + stream); + + stream.synchronize(); + + // extract all means and weights into a table + cudf::table_view tdigests_unsliced({mean, weight}); + + // generate the merged (but not yet compressed) tdigests for each group. + std::vector> tdigests; + tdigests.reserve(num_groups); + std::transform( + h_outer_offsets.begin(), + h_outer_offsets.end() - 1, + std::next(h_outer_offsets.begin()), + std::back_inserter(tdigests), + [&](auto tdigest_start, auto tdigest_end) { + // the range of tdigests in this group + auto const num_tdigests = tdigest_end - tdigest_start; + + // slice each tdigest from the input + std::vector unmerged_tdigests; + unmerged_tdigests.reserve(num_tdigests); + auto offset_iter = std::next(h_inner_offsets.begin(), tdigest_start); + std::transform(offset_iter, + offset_iter + num_tdigests, + std::next(offset_iter), + std::back_inserter(unmerged_tdigests), + [&](auto start, auto end) { + return cudf::detail::slice(tdigests_unsliced, {start, end}, stream); + }); + + // merge + return cudf::detail::merge(unmerged_tdigests, {0}, {order::ASCENDING}, {}, stream, mr); + }); + + // generate min and max values + auto min_col = scv.child(cudf::detail::tdigest::min_column_index); + auto merged_min_col = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + min_col.begin(), + 
thrust::make_discard_iterator(), + merged_min_col->mutable_view().begin(), + thrust::equal_to{}, // key equality check + thrust::minimum{}); + + auto max_col = scv.child(cudf::detail::tdigest::max_column_index); + auto merged_max_col = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + max_col.begin(), + thrust::make_discard_iterator(), + merged_max_col->mutable_view().begin(), + thrust::equal_to{}, // key equality check + thrust::maximum{}); + + // concatenate all the merged tdigests back into one table. + std::vector tdigest_views; + tdigest_views.reserve(num_groups); + std::transform(tdigests.begin(), + tdigests.end(), + std::back_inserter(tdigest_views), + [](std::unique_ptr
const& t) { return t->view(); }); + auto merged = cudf::detail::concatenate(tdigest_views, stream, mr); + + // generate cumulative weights + auto merged_weights = merged->get_column(cudf::detail::tdigest::weight_column_index).view(); + auto cumulative_weights = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED); + auto keys = cudf::detail::make_counting_transform_iterator( + 0, + [group_labels = group_labels.begin(), + inner_offsets = tdigest_offsets.begin(), + num_inner_offsets = tdigest_offsets.size()] __device__(int index) { + // what -original- tdigest index this absolute index corresponds to + auto const iter = thrust::prev( + thrust::upper_bound(thrust::seq, inner_offsets, inner_offsets + num_inner_offsets, index)); + auto const tdigest_index = thrust::distance(inner_offsets, iter); + + // what group index the original tdigest belongs to + return group_labels[tdigest_index]; + }); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + keys, + keys + cumulative_weights->size(), + merged_weights.begin(), + cumulative_weights->mutable_view().begin()); + + auto const delta = max_centroids; + + // generate cluster info + auto total_group_weight = cudf::detail::make_counting_transform_iterator( + 0, + [outer_offsets = group_offsets.data(), + inner_offsets = tdigest_offsets.begin(), + cumulative_weights = + cumulative_weights->view().begin()] __device__(size_type group_index) { + auto const last_weight_index = inner_offsets[outer_offsets[group_index + 1]] - 1; + return cumulative_weights[last_weight_index]; + }); + auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( + delta, + num_groups, + nearest_value_centroid_weights{cumulative_weights->view().begin(), + group_offsets.data(), + tdigest_offsets.begin()}, + total_group_weight, + cumulative_centroid_weight{ + cumulative_weights->view().begin(), + group_labels, + group_offsets.data(), + {tdigest_offsets.begin(), 
static_cast(tdigest_offsets.size())}}, + stream, + mr); + + // input centroid values + auto centroids = cudf::detail::make_counting_transform_iterator( + 0, + make_weighted_centroid{ + merged->get_column(cudf::detail::tdigest::mean_column_index).view().begin(), + merged_weights.begin()}); + + // compute the tdigest + return compute_tdigests(delta, + centroids, + centroids + merged->num_rows(), + cumulative_centroid_weight{cumulative_weights->view().begin(), + group_labels, + group_offsets.data(), + {tdigest_offsets.begin(), + static_cast(tdigest_offsets.size())}}, + std::move(merged_min_col), + std::move(merged_max_col), + group_cluster_wl, + std::move(group_cluster_offsets), + total_clusters, + stream, + mr); +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu new file mode 100644 index 00000000000..9aea59a195b --- /dev/null +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace detail { +namespace tdigest { + +// https://developer.nvidia.com/blog/lerp-faster-cuda/ +template +__device__ inline T lerp(T v0, T v1, T t) +{ + return fma(t, v1, fma(-t, v0, v0)); +} + +struct centroid { + double mean; + double weight; +}; + +struct make_centroid { + double const* means; + double const* weights; + __device__ centroid operator()(size_type i) { return {means[i], weights[i]}; } +}; + +// kernel for computing percentiles on input tdigest (mean, weight) centroid data. +template +__global__ void compute_percentiles_kernel(device_span tdigest_offsets, + column_device_view percentiles, + CentroidIter centroids_, + double const* min_, + double const* max_, + double const* cumulative_weight_, + double* output) +{ + int const tid = threadIdx.x + blockIdx.x * blockDim.x; + + auto const num_tdigests = tdigest_offsets.size() - 1; + auto const tdigest_index = tid / percentiles.size(); + if (tdigest_index >= num_tdigests) { return; } + auto const pindex = tid % percentiles.size(); + + // size of the digest we're querying + auto const tdigest_size = tdigest_offsets[tdigest_index + 1] - tdigest_offsets[tdigest_index]; + // no work to do. 
values will be set to null + if (tdigest_size == 0 || !percentiles.is_valid(pindex)) { return; } + + output[tid] = [&]() { + double const percentage = percentiles.element(pindex); + double const* cumulative_weight = cumulative_weight_ + tdigest_offsets[tdigest_index]; + + // centroids for this particular tdigest + CentroidIter centroids = centroids_ + tdigest_offsets[tdigest_index]; + + // min and max for the digest + double const* min_val = min_ + tdigest_index; + double const* max_val = max_ + tdigest_index; + + double const total_weight = cumulative_weight[tdigest_size - 1]; + + // The following Arrow code serves as a basis for this computation + // https://github.com/apache/arrow/blob/master/cpp/src/arrow/util/tdigest.cc#L280 + double const weighted_q = percentage * total_weight; + if (weighted_q <= 1) { + return *min_val; + } else if (weighted_q >= total_weight - 1) { + return *max_val; + } + + // determine what centroid this weighted quantile falls within. + size_type const centroid_index = static_cast(thrust::distance( + cumulative_weight, + thrust::lower_bound( + thrust::seq, cumulative_weight, cumulative_weight + tdigest_size, weighted_q))); + centroid c = centroids[centroid_index]; + + // diff == how far from the "center" of the centroid we are, + // in unit weights. + // visually: + // + // centroid of weight 7 + // C <-- center of the centroid + // |-------| + // | | | + // X Y Z + // X has a diff of -2 (2 units to the left of the center of the centroid) + // Y has a diff of 0 (directly in the middle of the centroid) + // Z has a diff of 3 (3 units to the right of the center of the centroid) + double const diff = weighted_q + c.weight / 2 - cumulative_weight[centroid_index]; + + // if we're completely within a centroid of weight 1, just return that. + if (c.weight == 1 && std::abs(diff) < 0.5) { return c.mean; } + + // otherwise, interpolate between two centroids. 
+ + // get the two centroids we want to interpolate between + auto const look_left = diff < 0; + auto const [lhs, rhs] = [&]() { + if (look_left) { + // if we're at the first centroid, "left" of us is the min value + auto const first_centroid = centroid_index == 0; + auto const lhs = first_centroid ? centroid{*min_val, 0} : centroids[centroid_index - 1]; + auto const rhs = c; + return std::pair{lhs, rhs}; + } else { + // if we're at the last centroid, "right" of us is the max value + auto const last_centroid = (centroid_index == tdigest_size - 1); + auto const lhs = c; + auto const rhs = last_centroid ? centroid{*max_val, 0} : centroids[centroid_index + 1]; + return std::pair{lhs, rhs}; + } + }(); + + // compute interpolation value t + + // total interpolation range. the total range of "space" between the lhs and rhs centroids. + auto const tip = lhs.weight / 2 + rhs.weight / 2; + // if we're looking left, diff is negative, so shift it so that we are interpolating + // from lhs -> rhs. + auto const t = (look_left) ? (diff + tip) / tip : diff / tip; + + // interpolate + return lerp(lhs.mean, rhs.mean, t); + }(); +} + +/** + * @brief Calculate approximate percentiles on a provided tdigest column. + * + * Produces a LIST column where each row `i` represents output from querying the + * corresponding tdigest of from row `i` in `input`. The length of each output list + * is the number of percentiles specified in `percentiles` + * + * @param input tdigest input data. One tdigest per row. + * @param percentiles Desired percentiles in range [0, 1]. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device + * memory + * + * @returns Column of doubles containing requested percentile values. 
+ */ +std::unique_ptr compute_approx_percentiles(structs_column_view const& input, + column_view const& percentiles, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + lists_column_view lcv(input.child(centroid_column_index)); + column_view min_col = input.child(min_column_index); + column_view max_col = input.child(max_column_index); + + // offsets, representing the size of each tdigest + auto offsets = lcv.offsets(); + + // extract means and weights + auto data = lcv.parent().child(lists_column_view::child_column_index); + structs_column_view tdigest(data); + auto mean = tdigest.child(mean_column_index); + auto weight = tdigest.child(weight_column_index); + + // compute summed weights + auto cumulative_weights = cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, + mean.size(), + mask_state::UNALLOCATED, + stream, + rmm::mr::get_current_device_resource()); + auto keys = cudf::detail::make_counting_transform_iterator( + 0, + [offsets_begin = offsets.begin(), + offsets_end = offsets.end()] __device__(size_type i) { + return thrust::distance( + offsets_begin, + thrust::prev(thrust::upper_bound(thrust::seq, offsets_begin, offsets_end, i))); + }); + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + keys, + keys + weight.size(), + weight.begin(), + cumulative_weights->mutable_view().begin()); + + auto percentiles_cdv = column_device_view::create(percentiles); + + // leaf is a column of size input.size() * percentiles.size() + auto const num_output_values = input.size() * percentiles.size(); + + // null percentiles become null results. + auto [null_mask, null_count] = [&]() { + return percentiles.null_count() != 0 + ? 
cudf::detail::valid_if( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_output_values, + [percentiles = *percentiles_cdv] __device__(size_type i) { + return percentiles.is_valid(i % percentiles.size()); + }) + : std::pair{rmm::device_buffer{}, 0}; + }(); + + auto result = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, num_output_values, std::move(null_mask), null_count, stream, mr); + + auto centroids = cudf::detail::make_counting_transform_iterator( + 0, make_centroid{mean.begin(), weight.begin()}); + + constexpr size_type block_size = 256; + cudf::detail::grid_1d const grid(percentiles.size() * input.size(), block_size); + compute_percentiles_kernel<<>>( + {offsets.begin(), static_cast(offsets.size())}, + *percentiles_cdv, + centroids, + min_col.begin(), + max_col.begin(), + cumulative_weights->view().begin(), + result->mutable_view().begin()); + + return result; +} + +void check_is_valid_tdigest_column(column_view const& col) +{ + // sanity check that this is actually tdigest data + CUDF_EXPECTS(col.type().id() == type_id::STRUCT, "Encountered invalid tdigest column"); + CUDF_EXPECTS(col.size() > 0, "tdigest columns must have > 0 rows"); + CUDF_EXPECTS(col.offset() == 0, "Encountered a sliced tdigest column"); + CUDF_EXPECTS(col.nullable() == false, "Encountered nullable tdigest column"); + + structs_column_view scv(col); + CUDF_EXPECTS(scv.num_children() == 3, "Encountered invalid tdigest column"); + CUDF_EXPECTS(scv.child(min_column_index).type().id() == type_id::FLOAT64, + "Encountered invalid tdigest column"); + CUDF_EXPECTS(scv.child(max_column_index).type().id() == type_id::FLOAT64, + "Encountered invalid tdigest column"); + + lists_column_view lcv(scv.child(centroid_column_index)); + auto data = lcv.child(); + CUDF_EXPECTS(data.type().id() == type_id::STRUCT, "Encountered invalid tdigest column"); + CUDF_EXPECTS(data.num_children() == 2, + "Encountered tdigest column with an invalid number of children"); + 
auto mean = data.child(mean_column_index); + CUDF_EXPECTS(mean.type().id() == type_id::FLOAT64, "Encountered invalid tdigest mean column"); + auto weight = data.child(weight_column_index); + CUDF_EXPECTS(weight.type().id() == type_id::FLOAT64, "Encountered invalid tdigest weight column"); +} + +std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // mean/weight columns + std::vector> inner_children; + inner_children.push_back(make_empty_column(data_type(type_id::FLOAT64))); + inner_children.push_back(make_empty_column(data_type(type_id::FLOAT64))); + + auto offsets = cudf::make_fixed_width_column( + data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr); + thrust::fill(rmm::exec_policy(stream), + offsets->mutable_view().begin(), + offsets->mutable_view().end(), + 0); + auto list = + make_lists_column(1, + std::move(offsets), + cudf::make_structs_column(0, std::move(inner_children), 0, {}, stream, mr), + 0, + {}); + + auto min_col = + cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + thrust::fill(rmm::exec_policy(stream), + min_col->mutable_view().begin(), + min_col->mutable_view().end(), + 0); + auto max_col = + cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); + thrust::fill(rmm::exec_policy(stream), + max_col->mutable_view().begin(), + max_col->mutable_view().end(), + 0); + + std::vector> children; + children.push_back(std::move(list)); + children.push_back(std::move(min_col)); + children.push_back(std::move(max_col)); + + return make_structs_column(1, std::move(children), 0, {}, stream, mr); +} + +} // namespace tdigest. 
+ +std::unique_ptr percentile_approx(structs_column_view const& input, + column_view const& percentiles, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + tdigest::check_is_valid_tdigest_column(input); + CUDF_EXPECTS(percentiles.type().id() == type_id::FLOAT64, + "percentile_approx expects float64 percentile inputs"); + + // output is a list column with each row containing percentiles.size() percentile values + auto offsets = cudf::make_fixed_width_column( + data_type{type_id::INT32}, input.size() + 1, mask_state::UNALLOCATED, stream, mr); + auto row_size_iter = thrust::make_constant_iterator(percentiles.size()); + thrust::exclusive_scan(rmm::exec_policy(stream), + row_size_iter, + row_size_iter + input.size() + 1, + offsets->mutable_view().begin()); + + if (percentiles.size() == 0) { + return cudf::make_lists_column( + input.size(), + std::move(offsets), + cudf::make_empty_column(data_type{type_id::FLOAT64}), + input.size(), + cudf::detail::create_null_mask( + input.size(), mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr)); + } + + // if any of the input digests are empty, nullify the corresponding output rows (values will be + // uninitialized) + auto [bitmask, null_count] = [stream, mr, input]() { + lists_column_view lcv(input.child(tdigest::centroid_column_index)); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [offsets = lcv.offsets().begin()] __device__(size_type index) { + return offsets[index + 1] - offsets[index] == 0 ? 1 : 0; + }); + auto const null_count = thrust::reduce(rmm::exec_policy(stream), iter, iter + input.size(), 0); + if (null_count == 0) { + return std::pair{rmm::device_buffer{}, null_count}; + } + return cudf::detail::valid_if( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + input.size(), + [offsets = lcv.offsets().begin()] __device__(size_type index) { + return offsets[index + 1] - offsets[index] == 0 ? 
0 : 1; + }, + stream, + mr); + }(); + + return cudf::make_lists_column( + input.size(), + std::move(offsets), + tdigest::compute_approx_percentiles(input, percentiles, stream, mr), + null_count, + std::move(bitmask), + stream, + mr); +} + +} // namespace detail + +std::unique_ptr percentile_approx(structs_column_view const& input, + column_view const& percentiles, + rmm::mr::device_memory_resource* mr) +{ + return percentile_approx(input, percentiles, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu index dc74a5f4ff1..42b57bdb47a 100644 --- a/cpp/src/sort/sort.cu +++ b/cpp/src/sort/sort.cu @@ -26,7 +26,7 @@ namespace cudf { namespace detail { -std::unique_ptr sorted_order(table_view input, +std::unique_ptr sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, @@ -75,7 +75,7 @@ struct inplace_column_sort_fn { } }; -std::unique_ptr
sort(table_view input, +std::unique_ptr
sort(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, @@ -101,7 +101,7 @@ std::unique_ptr
sort(table_view input, } // namespace detail -std::unique_ptr sorted_order(table_view input, +std::unique_ptr sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) @@ -110,7 +110,7 @@ std::unique_ptr sorted_order(table_view input, return detail::sorted_order(input, column_order, null_precedence, rmm::cuda_stream_default, mr); } -std::unique_ptr
sort(table_view input, +std::unique_ptr
sort(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu index 860e88ae76e..75335579de2 100644 --- a/cpp/src/sort/stable_sort.cu +++ b/cpp/src/sort/stable_sort.cu @@ -25,7 +25,7 @@ namespace cudf { namespace detail { -std::unique_ptr stable_sorted_order(table_view input, +std::unique_ptr stable_sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::cuda_stream_view stream, @@ -36,7 +36,7 @@ std::unique_ptr stable_sorted_order(table_view input, } // namespace detail -std::unique_ptr stable_sorted_order(table_view input, +std::unique_ptr stable_sorted_order(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, rmm::mr::device_memory_resource* mr) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 03f7967cee0..6d385ff969d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -85,6 +85,7 @@ ConfigureTest(GROUPBY_TEST groupby/sum_of_squares_tests.cpp groupby/sum_scan_tests.cpp groupby/sum_tests.cpp + groupby/tdigest_tests.cu groupby/var_tests.cpp) ################################################################################################### @@ -123,6 +124,7 @@ ConfigureTest(HASH_MAP_TEST ################################################################################################### # - quantiles tests ------------------------------------------------------------------------------- ConfigureTest(QUANTILES_TEST + quantiles/percentile_approx_test.cu quantiles/quantile_test.cpp quantiles/quantiles_test.cpp) diff --git a/cpp/tests/groupby/groupby_test_util.hpp b/cpp/tests/groupby/groupby_test_util.hpp index 542205b5b51..b333d9dacba 100644 --- a/cpp/tests/groupby/groupby_test_util.hpp +++ b/cpp/tests/groupby/groupby_test_util.hpp @@ -27,6 +27,9 @@ #include #include #include +#include + 
+#include namespace cudf { namespace test { @@ -128,5 +131,57 @@ inline void test_single_scan(column_view const& keys, expect_vals, *result.second[0].results[0], debug_output_level::ALL_ERRORS); } +template +inline T frand() +{ + return static_cast(rand()) / static_cast(RAND_MAX); +} + +template +inline T rand_range(T min, T max) +{ + return min + static_cast(frand() * (max - min)); +} + +inline std::unique_ptr generate_typed_percentile_distribution( + std::vector const& buckets, + std::vector const& sizes, + data_type t, + bool sorted = false) +{ + srand(0); + + std::vector values; + size_t total_size = std::reduce(sizes.begin(), sizes.end(), 0); + values.reserve(total_size); + for (size_t idx = 0; idx < sizes.size(); idx++) { + double min = idx == 0 ? 0.0f : buckets[idx - 1]; + double max = buckets[idx]; + + for (int v_idx = 0; v_idx < sizes[idx]; v_idx++) { + values.push_back(rand_range(min, max)); + } + } + + if (sorted) { std::sort(values.begin(), values.end()); } + + cudf::test::fixed_width_column_wrapper src(values.begin(), values.end()); + return cudf::cast(src, t); +} + +// "standardized" means the parameters sent into generate_typed_percentile_distribution. the intent +// is to provide a standardized set of inputs for use with tdigest generation tests and +// percentile_approx tests. 
std::vector +// buckets{10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0}; std::vector +// sizes{50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000}; +inline std::unique_ptr generate_standardized_percentile_distribution( + data_type t = data_type{type_id::FLOAT64}, bool sorted = false) +{ + std::vector buckets{10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0, 90.0f, 100.0f}; + std::vector b_sizes{ + 50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000}; + return generate_typed_percentile_distribution(buckets, b_sizes, t, sorted); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu new file mode 100644 index 00000000000..818999867c1 --- /dev/null +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -0,0 +1,584 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arrow/util/tdigest.h" + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +#include + +namespace cudf { +namespace test { + +using namespace cudf; + +typedef thrust::tuple expected_value; + +template +struct TDigestAllTypes : public cudf::test::BaseFixture { +}; +TYPED_TEST_CASE(TDigestAllTypes, cudf::test::NumericTypes); + +struct tdigest_gen { + template < + typename T, + typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) + { + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + } + + template < + typename T, + typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) + { + CUDF_FAIL("Invalid tdigest test type"); + } +}; + +void tdigest_sample_compare(column_view const& result, + std::vector const& h_expected) +{ + cudf::detail::tdigest::check_is_valid_tdigest_column(result); + cudf::structs_column_view scv(result); + cudf::lists_column_view lcv(scv.child(cudf::detail::tdigest::centroid_column_index)); + cudf::structs_column_view tdigests(lcv.child()); + column_view result_mean = tdigests.child(cudf::detail::tdigest::mean_column_index); + column_view result_weight = tdigests.child(cudf::detail::tdigest::weight_column_index); + + auto expected_mean = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, h_expected.size(), mask_state::UNALLOCATED); + auto expected_weight = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, h_expected.size(), mask_state::UNALLOCATED); + auto sampled_result_mean = 
cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, h_expected.size(), mask_state::UNALLOCATED); + auto sampled_result_weight = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, h_expected.size(), mask_state::UNALLOCATED); + + rmm::device_vector expected(h_expected.begin(), h_expected.end()); + auto iter = thrust::make_counting_iterator(0); + thrust::for_each( + rmm::exec_policy(rmm::cuda_stream_default), + iter, + iter + expected.size(), + [expected = expected.data().get(), + expected_mean = expected_mean->mutable_view().begin(), + expected_weight = expected_weight->mutable_view().begin(), + result_mean = result_mean.begin(), + result_weight = result_weight.begin(), + sampled_result_mean = sampled_result_mean->mutable_view().begin(), + sampled_result_weight = + sampled_result_weight->mutable_view().begin()] __device__(size_type index) { + expected_mean[index] = thrust::get<1>(expected[index]); + expected_weight[index] = thrust::get<2>(expected[index]); + auto const src_index = thrust::get<0>(expected[index]); + sampled_result_mean[index] = result_mean[src_index]; + sampled_result_weight[index] = result_weight[src_index]; + }); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected_mean, *sampled_result_mean); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_weight, *sampled_result_weight); +} + +template +std::unique_ptr make_expected_tdigest(column_view const& mean, + column_view const& weight, + T min, + T max) +{ + std::vector> inner_children; + inner_children.push_back(std::make_unique(mean)); + inner_children.push_back(std::make_unique(weight)); + // tdigest struct + auto tdigests = cudf::make_structs_column(mean.size(), std::move(inner_children), 0, {}); + + std::vector h_offsets{0, mean.size()}; + auto offsets = + cudf::make_fixed_width_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED); + cudaMemcpy(offsets->mutable_view().begin(), + h_offsets.data(), + sizeof(offset_type) * 2, + cudaMemcpyHostToDevice); + + auto list = 
cudf::make_lists_column(1, std::move(offsets), std::move(tdigests), 0, {}); + + auto min_col = + cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, 1, mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + min_col->mutable_view().begin(), + min_col->mutable_view().end(), + static_cast(min)); + auto max_col = + cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, 1, mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + max_col->mutable_view().begin(), + max_col->mutable_view().end(), + static_cast(max)); + + std::vector> children; + children.push_back(std::move(list)); + children.push_back(std::move(min_col)); + children.push_back(std::move(max_col)); + return make_structs_column(1, std::move(children), 0, {}); +} + +TYPED_TEST(TDigestAllTypes, Simple) +{ + using T = TypeParam; + + // create a tdigest that has far fewer values in it than the delta value. this should result + // in every value remaining uncompressed + cudf::test::fixed_width_column_wrapper values{126, 15, 1, 99, 67}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0}; + int const delta = 1000; + auto result = cudf::type_dispatcher( + static_cast(values).type(), tdigest_gen{}, keys, values, delta); + + cudf::test::fixed_width_column_wrapper raw_mean({1, 15, 67, 99, 126}); + cudf::test::fixed_width_column_wrapper weight{1, 1, 1, 1, 1}; + auto mean = cudf::cast(raw_mean, data_type{type_id::FLOAT64}); + double const min = 1; + double const max = 126; + auto expected = make_expected_tdigest(*mean, weight, static_cast(min), static_cast(max)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TYPED_TEST(TDigestAllTypes, SimpleWithNulls) +{ + using T = TypeParam; + + // create a tdigest that has far fewer values in it than the delta value. 
this should result + // in every value remaining uncompressed + cudf::test::fixed_width_column_wrapper values{{122, 15, 1, 99, 67, 101, 100, 84, 44, 2}, + {1, 0, 1, 0, 1, 0, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int const delta = 1000; + auto result = cudf::type_dispatcher( + static_cast(values).type(), tdigest_gen{}, keys, values, delta); + + cudf::test::fixed_width_column_wrapper raw_mean({1, 44, 67, 100, 122}); + cudf::test::fixed_width_column_wrapper weight{1, 1, 1, 1, 1}; + auto mean = cudf::cast(raw_mean, data_type{type_id::FLOAT64}); + double const min = 1; + double const max = 122; + auto expected = make_expected_tdigest(*mean, weight, static_cast(min), static_cast(max)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TYPED_TEST(TDigestAllTypes, AllNull) +{ + using T = TypeParam; + + // create a tdigest that has far fewer values in it than the delta value. this should result + // in every value remaining uncompressed + cudf::test::fixed_width_column_wrapper values{{122, 15, 1, 99, 67, 101, 100, 84, 44, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int const delta = 1000; + auto result = cudf::type_dispatcher( + static_cast(values).type(), tdigest_gen{}, keys, values, delta); + + // NOTE: an empty tdigest column still has 1 row. 
+ auto expected = cudf::detail::tdigest::make_empty_tdigest_column(); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TYPED_TEST(TDigestAllTypes, LargeGroups) +{ + auto _values = generate_standardized_percentile_distribution(data_type{type_id::FLOAT64}); + int const delta = 1000; + + // generate a random set of keys + std::vector h_keys; + h_keys.reserve(_values->size()); + auto iter = thrust::make_counting_iterator(0); + std::transform(iter, iter + _values->size(), std::back_inserter(h_keys), [](int i) { + return static_cast(round(rand_range(0, 8))); + }); + cudf::test::fixed_width_column_wrapper _keys(h_keys.begin(), h_keys.end()); + + // group the input values together + cudf::table_view k({_keys}); + cudf::groupby::groupby setup_gb(k); + cudf::table_view v({*_values}); + auto groups = setup_gb.get_groups(v); + + // slice it all up so we have keys/columns for everything. + std::vector keys; + std::vector values; + for (size_t idx = 0; idx < groups.offsets.size() - 1; idx++) { + auto k = + cudf::slice(groups.keys->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); + keys.push_back(k[0]); + + auto v = + cudf::slice(groups.values->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); + values.push_back(v[0]); + } + + // generate a seperate tdigest for each group + std::vector> parts; + std::transform( + iter, iter + values.size(), std::back_inserter(parts), [&keys, &values, delta](int i) { + cudf::table_view t({keys[i]}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values[i], std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + }); + std::vector part_views; + std::transform(parts.begin(), + parts.end(), + std::back_inserter(part_views), + [](std::unique_ptr const& col) { return col->view(); }); + auto merged_parts = 
cudf::concatenate(part_views); + + // generate a tdigest on the whole input set + cudf::table_view t({_keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({*_values, std::move(aggregations)}); + auto result = gb.aggregate(requests); + + // verify that they end up the same. + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result.second[0].results[0], *merged_parts); +} + +struct TDigestTest : public cudf::test::BaseFixture { +}; + +TEST_F(TDigestTest, LargeInputDouble) +{ + // these tests are being done explicitly because of the way we have to precompute the correct + // answers. since the input values generated by the generate_distribution() function below are + // cast to specific types -before- being sent into the aggregation, I can't (safely) just use the + // expected values that you get when using doubles all the way through. so I have to pregenerate + // the correct answers for each type by hand. 
so, we'll choose a reasonable subset (double, + // decimal, int, bool) + + auto values = generate_standardized_percentile_distribution(data_type{type_id::FLOAT64}); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + // compare against a sample of known/expected values (which themselves were verified against the + // Arrow implementation) + + // delta 1000 + { + int const delta = 1000; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0.00040692343794663995, 7}, + {10, 0.16234555627091204477, 153}, + {59, 5.12764811246045937310, 858}, + {250, 62.54581814492237157310, 2356}, + {368, 87.85834376680742252574, 1735}, + {409, 94.07685720279611985006, 1272}, + {491, 99.94197663121231300920, 130}, + {500, 99.99969880795092080916, 2}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 100 + { + int const delta = 100; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0.07265722021410986331, 739}, + {7, 8.19766194442652640362, 10693}, + {16, 36.82277869518204482802, 20276}, + {29, 72.95424834129075009059, 22623}, + {38, 90.61229683516096145013, 15581}, + {46, 99.07283498858802772702, 5142}, + {50, 99.99970905482754801596, 1}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 10 + { + int const delta = 10; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 7.15508346777729631327, 71618}, + {1, 33.04971680740474226923, 187499}, + {2, 62.50566666553867634093, 231762}, + {3, 83.46216572053654658703, 187500}, + {4, 96.42204425201593664951, 71620}, + {5, 99.99970905482754801596, 1}}; + 
+ tdigest_sample_compare(*result, expected); + } +} + +TEST_F(TDigestTest, LargeInputInt) +{ + // these tests are being done explicitly because of the way we have to precompute the correct + // answers. since the input values generated by the generate_distribution() function below are + // cast to specific types -before- being sent into the aggregation, I can't (safely) just use the + // expected values that you get when using doubles all the way through. so I have to pregenerate + // the correct answers for each type by hand. so, we'll choose a reasonable subset (double, + // decimal, int, bool) + + auto values = generate_standardized_percentile_distribution(data_type{type_id::INT32}); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + // compare against a sample of known/expected values (which themselves were verified against the + // Arrow implementation) + + // delta 1000 + { + int const delta = 1000; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0, 7}, + {14, 0, 212}, + {26, 0.83247422680412408447, 388}, + {44, 2, 648}, + {45, 2.42598187311178170589, 662}, + {342, 82.75190258751908345403, 1971}, + {383, 90, 1577}, + {417, 94.88376068376066996279, 1170}, + {418, 95, 1157}, + {479, 99, 307}, + {500, 99, 2}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 100 + { + int const delta = 100; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0, 739}, + {7, 7.71486018890863167741, 10693}, + {16, 36.32491615703294485229, 20276}, + {29, 72.44392874508245938614, 22623}, + {38, 90.14209614273795523332, 15581}, + {46, 98.64041229093737683797, 5142}, + {50, 99, 
1}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 10 + { + int const delta = 10; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 6.66025300902007799664, 71618}, + {1, 32.54912826201739051157, 187499}, + {2, 62.00734805533262772315, 231762}, + {3, 82.96355733333332693746, 187500}, + {4, 95.91280368612116546956, 71620}, + {5, 99, 1}}; + + tdigest_sample_compare(*result, expected); + } +} + +TEST_F(TDigestTest, LargeInputDecimal) +{ + // these tests are being done explicitly because of the way we have to precompute the correct + // answers. since the input values generated by the generate_distribution() function below are + // cast to specific types -before- being sent into the aggregation, I can't (safely) just use the + // expected values that you get when using doubles all the way through. so I have to pregenerate + // the correct answers for each type by hand. so, we'll choose a reasonable subset (double, + // decimal, int, bool) + + auto values = generate_standardized_percentile_distribution(data_type{type_id::DECIMAL32, -4}); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + // compare against a sample of known/expected values (which themselves were verified against the + // Arrow implementation) + + // delta 1000 + { + int const delta = 1000; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0.00035714285714285709, 7}, + {10, 0.16229738562091505782, 153}, + {59, 5.12759696969697031932, 858}, + {250, 62.54576854838715860296, 2356}, + {368, 87.85829446685879418055, 1735}, + {409, 94.07680636792450457051, 1272}, + {491, 99.94192461538463589932, 130}, 
+ {500, 99.99965000000000259206, 2}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 100 + { + int const delta = 100; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 0.07260811907983763525, 739}, + {7, 8.19761183016926864298, 10693}, + {16, 36.82272891595975750079, 20276}, + {29, 72.95419827167043536065, 22623}, + {38, 90.61224673640975879607, 15581}, + {46, 99.07278498638662256326, 5142}, + {50, 99.99970000000000425189, 1}}; + + tdigest_sample_compare(*result, expected); + } + + // delta 10 + { + int const delta = 10; + auto result = + cudf::type_dispatcher(values->view().type(), tdigest_gen{}, *keys, *values, delta); + std::vector expected{{0, 7.15503361864335740705, 71618}, + {1, 33.04966679715625588187, 187499}, + {2, 62.50561666407782013266, 231762}, + {3, 83.46211575573336460820, 187500}, + {4, 96.42199425300195514410, 71620}, + {5, 99.99970000000000425189, 1}}; + + tdigest_sample_compare(*result, expected); + } +} + +struct TDigestMergeTest : public cudf::test::BaseFixture { +}; + +// Note: there is no need to test different types here as the internals of a tdigest are always +// the same regardless of input. 
+TEST_F(TDigestMergeTest, Simple) +{ + auto values = generate_standardized_percentile_distribution(data_type{type_id::FLOAT64}); + CUDF_EXPECTS(values->size() == 750000, "Unexpected distribution size"); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + auto split_values = cudf::split(*values, {250000, 500000}); + auto split_keys = cudf::split(*keys, {250000, 500000}); + + int const delta = 1000; + + // generate separate digests + std::vector> parts; + auto iter = thrust::make_counting_iterator(0); + std::transform( + iter, + iter + split_values.size(), + std::back_inserter(parts), + [&split_keys, &split_values, delta](int i) { + cudf::table_view t({split_keys[i]}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({split_values[i], std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + }); + std::vector part_views; + std::transform(parts.begin(), + parts.end(), + std::back_inserter(part_views), + [](std::unique_ptr const& col) { return col->view(); }); + + // merge delta = 1000 + { + int const merge_delta = 1000; + + // merge them + auto merge_input = cudf::concatenate(part_views); + cudf::test::fixed_width_column_wrapper merge_keys{0, 0, 0}; + cudf::table_view key_table({merge_keys}); + cudf::groupby::groupby gb(key_table); + std::vector requests; + std::vector> aggregations; + aggregations.push_back( + cudf::make_merge_tdigest_aggregation(merge_delta)); + requests.push_back({*merge_input, std::move(aggregations)}); + auto result = gb.aggregate(requests); + + std::vector expected{{0, 0.00013945158577498588, 2}, + {10, 0.04804393446447510763, 50}, 
+ {59, 1.68846964439246893797, 284}, + {250, 33.36323141295877547918, 1479}, + {368, 65.36307727957283475462, 2292}, + {409, 73.95399208218296394080, 1784}, + {490, 87.67566167909056673579, 1570}, + {491, 87.83119717763385381204, 1570}, + {500, 89.24891838334393412424, 1555}, + {578, 95.87182997389099625707, 583}, + {625, 98.20470345147104751504, 405}, + {700, 99.96818381983835877236, 56}, + {711, 99.99970905482754801596, 1}}; + + tdigest_sample_compare(*result.second[0].results[0], expected); + } +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/quantiles/percentile_approx_test.cu b/cpp/tests/quantiles/percentile_approx_test.cu new file mode 100644 index 00000000000..39f7cc593d6 --- /dev/null +++ b/cpp/tests/quantiles/percentile_approx_test.cu @@ -0,0 +1,435 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include + +using namespace cudf; + +struct tdigest_gen { + template < + typename T, + typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) + { + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + } + + template < + typename T, + typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, column_view const& values, int delta) + { + CUDF_FAIL("Invalid tdigest test type"); + } +}; + +std::unique_ptr arrow_percentile_approx(column_view const& _values, + int delta, + std::vector const& percentages) +{ + // sort the incoming values using the same settings that groupby does. 
+ // this is a little weak because null_order::AFTER is hardcoded internally to groupby. + table_view t({_values}); + auto sorted_t = cudf::sort(t, {}, {null_order::AFTER}); + auto sorted_values = sorted_t->get_column(0).view(); + + std::vector h_values(sorted_values.size()); + cudaMemcpy(h_values.data(), + sorted_values.data(), + sizeof(double) * sorted_values.size(), + cudaMemcpyDeviceToHost); + std::vector h_validity(sorted_values.size()); + if (sorted_values.null_mask() != nullptr) { + auto validity = cudf::mask_to_bools(sorted_values.null_mask(), 0, sorted_values.size()); + cudaMemcpy(h_validity.data(), + (validity->view().data()), + sizeof(char) * sorted_values.size(), + cudaMemcpyDeviceToHost); + } + + // generate the tdigest + arrow::internal::TDigest atd(delta, sorted_values.size() * 2); + for (size_t idx = 0; idx < h_values.size(); idx++) { + if (sorted_values.null_mask() == nullptr || h_validity[idx]) { atd.Add(h_values[idx]); } + } + + // generate the percentiles and stuff them into a list column + std::vector h_result; + h_result.reserve(percentages.size()); + std::transform( + percentages.begin(), percentages.end(), std::back_inserter(h_result), [&atd](double p) { + return atd.Quantile(p); + }); + cudf::test::fixed_width_column_wrapper result(h_result.begin(), h_result.end()); + cudf::test::fixed_width_column_wrapper offsets{ + 0, static_cast(percentages.size())}; + return cudf::make_lists_column(1, offsets.release(), result.release(), 0, {}); +} + +struct percentile_approx_dispatch { + template < + typename T, + typename std::enable_if_t() || cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, + column_view const& values, + int delta, + std::vector const& percentages, + size_type ulps) + { + // arrow implementation. + auto expected = [&]() { + // we're explicitly casting back to doubles here but this is ok because that is + // exactly what happens inside of the cudf implementation as values are processed as well. 
so + // this should not affect results. + auto as_doubles = cudf::cast(values, data_type{type_id::FLOAT64}); + return arrow_percentile_approx(*as_doubles, delta, percentages); + }(); + + // gpu + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto gb_result = gb.aggregate(requests); + + cudf::test::fixed_width_column_wrapper g_percentages(percentages.begin(), + percentages.end()); + structs_column_view scv(*(gb_result.second[0].results[0])); + auto result = cudf::percentile_approx(scv, g_percentages); + + cudf::test::expect_columns_equivalent( + *expected, *result, cudf::test::debug_output_level::FIRST_ERROR, ulps); + + return result; + } + + template < + typename T, + typename std::enable_if_t() && !cudf::is_fixed_point()>* = nullptr> + std::unique_ptr operator()(column_view const& keys, + column_view const& values, + int delta, + std::vector const& percentages, + size_type ulps) + { + CUDF_FAIL("Invalid input type for percentile_approx test"); + } +}; + +void percentile_approx_test(column_view const& _keys, + column_view const& _values, + int delta, + std::vector const& percentages, + size_type ulps) +{ + // first pass: validate the actual percentages we get per group. + + // produce the groups + cudf::table_view k({_keys}); + cudf::groupby::groupby pass1_gb(k); + cudf::table_view v({_values}); + auto groups = pass1_gb.get_groups(v); + // slice it all up so we have keys/columns for everything. 
+ std::vector keys; + std::vector values; + for (size_t idx = 0; idx < groups.offsets.size() - 1; idx++) { + auto k = + cudf::slice(groups.keys->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); + keys.push_back(k[0]); + + auto v = + cudf::slice(groups.values->get_column(0), {groups.offsets[idx], groups.offsets[idx + 1]}); + values.push_back(v[0]); + } + + std::vector> parts; + for (size_t idx = 0; idx < values.size(); idx++) { + // do any casting of the input + parts.push_back(cudf::type_dispatcher(values[idx].type(), + percentile_approx_dispatch{}, + keys[idx], + values[idx], + delta, + percentages, + ulps)); + } + std::vector part_views; + std::transform(parts.begin(), + parts.end(), + std::back_inserter(part_views), + [](std::unique_ptr const& c) { return c->view(); }); + auto expected = cudf::concatenate(part_views); + + // second pass. run the percentile_approx with all the keys in one pass and make sure we get the + // same results as the concatenated by-key results above + + cudf::groupby::groupby gb(k); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({_values, std::move(aggregations)}); + auto gb_result = gb.aggregate(requests); + + cudf::test::fixed_width_column_wrapper g_percentages(percentages.begin(), + percentages.end()); + structs_column_view scv(*(gb_result.second[0].results[0])); + auto result = cudf::percentile_approx(scv, g_percentages); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *result); +} + +void simple_test(data_type input_type, std::vector> params) +{ + auto values = cudf::test::generate_standardized_percentile_distribution(input_type); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + 
std::for_each(params.begin(), params.end(), [&](std::pair const& params) { + percentile_approx_test( + *keys, *values, params.first, {0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0}, params.second); + }); +} + +struct group_index { + __device__ int operator()(int i) { return i / 150000; } +}; + +void grouped_test(data_type input_type, std::vector> params) +{ + auto values = cudf::test::generate_standardized_percentile_distribution(input_type); + // split the input into multiple groups (one group per 150000 values) + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + auto i = thrust::make_counting_iterator(0); + thrust::transform(rmm::exec_policy(rmm::cuda_stream_default), + i, + i + values->size(), + keys->mutable_view().template begin(), + group_index{}); + + std::for_each(params.begin(), params.end(), [&](std::pair const& params) { + percentile_approx_test( + *keys, *values, params.first, {0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0}, params.second); + }); +} + +std::pair make_null_mask(column_view const& col) +{ + return cudf::detail::valid_if(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(col.size()), + [] __device__(size_type i) { return i % 2 == 0; }); +} + +void simple_with_nulls_test(data_type input_type, std::vector> params) +{ + auto values = cudf::test::generate_standardized_percentile_distribution(input_type); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + keys->mutable_view().template begin(), + keys->mutable_view().template end(), + 0); + + // add a null mask + auto mask = make_null_mask(*values); + values->set_null_mask(mask.first, mask.second); + + std::for_each(params.begin(), params.end(), [&](std::pair const& params) { + percentile_approx_test( + *keys, *values, params.first, {0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0}, params.second); + }); +} + +void 
grouped_with_nulls_test(data_type input_type, std::vector> params) +{ + auto values = cudf::test::generate_standardized_percentile_distribution(input_type); + // split the input into multiple groups (one group per 150000 values) + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + auto i = thrust::make_counting_iterator(0); + thrust::transform(rmm::exec_policy(rmm::cuda_stream_default), + i, + i + values->size(), + keys->mutable_view().template begin(), + group_index{}); + + // add a null mask + auto mask = make_null_mask(*values); + values->set_null_mask(mask.first, mask.second); + + std::for_each(params.begin(), params.end(), [&](std::pair const& params) { + percentile_approx_test( + *keys, *values, params.first, {0.0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0}, params.second); + }); +} + +template +data_type get_appropriate_type() +{ + if constexpr (cudf::is_fixed_point()) { return data_type{cudf::type_to_id(), -7}; } + return data_type{cudf::type_to_id()}; +} + +using PercentileApproxTypes = + cudf::test::Concat; + +template +struct PercentileApproxInputTypesTest : public cudf::test::BaseFixture { +}; +TYPED_TEST_CASE(PercentileApproxInputTypesTest, PercentileApproxTypes); + +TYPED_TEST(PercentileApproxInputTypesTest, Simple) +{ + using T = TypeParam; + auto const input_type = get_appropriate_type(); + + simple_test(input_type, + {{1000, cudf::test::default_ulp}, + {100, cudf::test::default_ulp * 4}, + {10, cudf::test::default_ulp * 11}}); +} + +TYPED_TEST(PercentileApproxInputTypesTest, Grouped) +{ + using T = TypeParam; + auto const input_type = get_appropriate_type(); + + grouped_test(input_type, + {{1000, cudf::test::default_ulp}, + {100, cudf::test::default_ulp * 2}, + {10, cudf::test::default_ulp * 10}}); +} + +TYPED_TEST(PercentileApproxInputTypesTest, SimpleWithNulls) +{ + using T = TypeParam; + auto const input_type = get_appropriate_type(); + + simple_with_nulls_test(input_type, + {{1000, cudf::test::default_ulp}, + {100, 
cudf::test::default_ulp * 2}, + {10, cudf::test::default_ulp * 11}}); +} + +TYPED_TEST(PercentileApproxInputTypesTest, GroupedWithNulls) +{ + using T = TypeParam; + auto const input_type = get_appropriate_type(); + + grouped_with_nulls_test(input_type, + {{1000, cudf::test::default_ulp}, + {100, cudf::test::default_ulp * 2}, + {10, cudf::test::default_ulp * 6}}); +} + +struct PercentileApproxTest : public cudf::test::BaseFixture { +}; + +TEST_F(PercentileApproxTest, EmptyInput) +{ + auto empty_ = cudf::detail::tdigest::make_empty_tdigest_column(); + cudf::test::fixed_width_column_wrapper percentiles{0.0, 0.25, 0.3}; + + std::vector input; + input.push_back(*empty_); + input.push_back(*empty_); + input.push_back(*empty_); + auto empty = cudf::concatenate(input); + + structs_column_view scv(*empty); + auto result = cudf::percentile_approx(scv, percentiles); + + cudf::test::fixed_width_column_wrapper offsets{0, 0, 0, 0}; + std::vector nulls{0, 0, 0}; + auto expected = + cudf::make_lists_column(3, + offsets.release(), + cudf::make_empty_column(data_type{type_id::FLOAT64}), + 3, + cudf::test::detail::make_null_mask(nulls.begin(), nulls.end())); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TEST_F(PercentileApproxTest, EmptyPercentiles) +{ + auto const delta = 1000; + + cudf::test::fixed_width_column_wrapper values{0, 1, 2, 3, 4, 5}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 1, 1, 1}; + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto tdigest_column = gb.aggregate(requests); + + cudf::test::fixed_width_column_wrapper percentiles{}; + + structs_column_view scv(*tdigest_column.second[0].results[0]); + auto result = cudf::percentile_approx(scv, percentiles); + + cudf::test::fixed_width_column_wrapper offsets{0, 0, 0}; + auto expected = 
cudf::make_lists_column(2, + offsets.release(), + cudf::make_empty_column(data_type{type_id::FLOAT64}), + 2, + cudf::detail::create_null_mask(2, mask_state::ALL_NULL)); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); +} + +TEST_F(PercentileApproxTest, NullPercentiles) +{ + auto const delta = 1000; + + cudf::test::fixed_width_column_wrapper values{1, 1, 2, 3, 4, 5, 6, 7, 8}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 1, 1, 1, 1}; + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto tdigest_column = gb.aggregate(requests); + + structs_column_view scv(*tdigest_column.second[0].results[0]); + + cudf::test::fixed_width_column_wrapper npercentiles{{0.5, 0.5, 1.0, 1.0}, {0, 0, 1, 1}}; + auto result = cudf::percentile_approx(scv, npercentiles); + + std::vector valids{0, 0, 1, 1}; + cudf::test::lists_column_wrapper expected{{{99, 99, 4, 4}, valids.begin()}, + {{99, 99, 8, 8}, valids.begin()}}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); +} \ No newline at end of file diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index f3002bc4b1a..0f10d6efe4a 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -323,7 +323,8 @@ class corresponding_rows_unequal { corresponding_rows_unequal(table_device_view d_lhs, table_device_view d_rhs, column_device_view lhs_row_indices_, - column_device_view rhs_row_indices_) + column_device_view rhs_row_indices_, + size_type /*fp_ulps*/) : comp(d_lhs, d_rhs), lhs_row_indices(lhs_row_indices_), rhs_row_indices(rhs_row_indices_) { } @@ -347,16 +348,20 @@ class corresponding_rows_not_equivalent { column_device_view lhs_row_indices; column_device_view rhs_row_indices; + size_type const fp_ulps; + public: 
corresponding_rows_not_equivalent(table_device_view d_lhs, table_device_view d_rhs, column_device_view lhs_row_indices_, - column_device_view rhs_row_indices_) + column_device_view rhs_row_indices_, + size_type fp_ulps_) : d_lhs(d_lhs), d_rhs(d_rhs), comp(d_lhs, d_rhs), lhs_row_indices(lhs_row_indices_), - rhs_row_indices(rhs_row_indices_) + rhs_row_indices(rhs_row_indices_), + fp_ulps(fp_ulps_) { CUDF_EXPECTS(d_lhs.num_columns() == 1 and d_rhs.num_columns() == 1, "Unsupported number of columns"); @@ -368,7 +373,8 @@ class corresponding_rows_not_equivalent { column_device_view const& lhs, column_device_view const& rhs, size_type lhs_index, - size_type rhs_index) + size_type rhs_index, + size_type fp_ulps) { if (lhs.is_valid(lhs_index) and rhs.is_valid(rhs_index)) { T const x = lhs.element(lhs_index); @@ -380,10 +386,9 @@ class corresponding_rows_not_equivalent { } else if (std::isnan(x) || std::isnan(y)) { return std::isnan(x) != std::isnan(y); // comparison of (nan==nan) returns false } else { - constexpr int ulp = 4; // ulp = unit of least precision, value taken from google test T const abs_x_minus_y = std::abs(x - y); return abs_x_minus_y >= std::numeric_limits::min() && - abs_x_minus_y > std::numeric_limits::epsilon() * std::abs(x + y) * ulp; + abs_x_minus_y > std::numeric_limits::epsilon() * std::abs(x + y) * fp_ulps; } } else { // if either is null, then the inequality was checked already @@ -409,8 +414,13 @@ class corresponding_rows_not_equivalent { if (not comp(lhs_index, rhs_index)) { auto lhs_col = this->d_lhs.column(0); auto rhs_col = this->d_rhs.column(0); - return type_dispatcher( - lhs_col.type(), typed_element_not_equivalent{}, lhs_col, rhs_col, lhs_index, rhs_index); + return type_dispatcher(lhs_col.type(), + typed_element_not_equivalent{}, + lhs_col, + rhs_col, + lhs_index, + rhs_index, + fp_ulps); } return false; } @@ -468,6 +478,7 @@ struct column_comparator_impl { column_view const& lhs_row_indices, column_view const& rhs_row_indices, 
debug_output_level verbosity, + size_type fp_ulps, int depth) { auto d_lhs = cudf::table_device_view::create(table_view{{lhs}}); @@ -483,12 +494,12 @@ struct column_comparator_impl { auto differences = rmm::device_uvector( lhs.size(), rmm::cuda_stream_default); // worst case: everything different auto input_iter = thrust::make_counting_iterator(0); - auto diff_iter = - thrust::copy_if(rmm::exec_policy(), - input_iter, - input_iter + lhs_row_indices.size(), - differences.begin(), - ComparatorType(*d_lhs, *d_rhs, *d_lhs_row_indices, *d_rhs_row_indices)); + auto diff_iter = thrust::copy_if( + rmm::exec_policy(), + input_iter, + input_iter + lhs_row_indices.size(), + differences.begin(), + ComparatorType(*d_lhs, *d_rhs, *d_lhs_row_indices, *d_rhs_row_indices, fp_ulps)); differences.resize(thrust::distance(differences.begin(), diff_iter), rmm::cuda_stream_default); // shrink back down @@ -519,6 +530,7 @@ struct column_comparator_impl { column_view const& lhs_row_indices, column_view const& rhs_row_indices, debug_output_level verbosity, + size_type fp_ulps, int depth) { lists_column_view lhs_l(lhs); @@ -638,6 +650,7 @@ struct column_comparator_impl { *lhs_child_indices, *rhs_child_indices, verbosity, + fp_ulps, depth + 1); } @@ -652,6 +665,7 @@ struct column_comparator_impl { column_view const& lhs_row_indices, column_view const& rhs_row_indices, debug_output_level verbosity, + size_type fp_ulps, int depth) { structs_column_view l_scv(lhs); @@ -667,6 +681,7 @@ struct column_comparator_impl { lhs_row_indices, rhs_row_indices, verbosity, + fp_ulps, depth + 1)) { return false; } @@ -683,6 +698,7 @@ struct column_comparator { column_view const& lhs_row_indices, column_view const& rhs_row_indices, debug_output_level verbosity, + size_type fp_ulps, int depth = 0) { CUDF_EXPECTS(lhs_row_indices.size() == rhs_row_indices.size(), @@ -701,7 +717,7 @@ struct column_comparator { // compare values column_comparator_impl comparator{}; - return comparator(lhs, rhs, lhs_row_indices, 
rhs_row_indices, verbosity, depth); + return comparator(lhs, rhs, lhs_row_indices, rhs_row_indices, verbosity, fp_ulps, depth); } }; @@ -750,8 +766,14 @@ bool expect_columns_equal(cudf::column_view const& lhs, debug_output_level verbosity) { auto indices = generate_all_row_indices(lhs.size()); - return cudf::type_dispatcher( - lhs.type(), column_comparator{}, lhs, rhs, *indices, *indices, verbosity); + return cudf::type_dispatcher(lhs.type(), + column_comparator{}, + lhs, + rhs, + *indices, + *indices, + verbosity, + cudf::test::default_ulp); } /** @@ -759,11 +781,12 @@ bool expect_columns_equal(cudf::column_view const& lhs, */ bool expect_columns_equivalent(cudf::column_view const& lhs, cudf::column_view const& rhs, - debug_output_level verbosity) + debug_output_level verbosity, + size_type fp_ulps) { auto indices = generate_all_row_indices(lhs.size()); return cudf::type_dispatcher( - lhs.type(), column_comparator{}, lhs, rhs, *indices, *indices, verbosity); + lhs.type(), column_comparator{}, lhs, rhs, *indices, *indices, verbosity, fp_ulps); } /**