diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c87033238c7..e993f548e1d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ repos: - - repo: https://github.com/pycqa/isort + - repo: https://github.com/PyCQA/isort rev: 5.6.4 hooks: - id: isort @@ -27,12 +27,12 @@ repos: name: isort-dask-cudf args: ["--settings-path=python/dask_cudf/setup.cfg"] files: python/dask_cudf/.* - - repo: https://github.com/ambv/black + - repo: https://github.com/psf/black rev: 19.10b0 hooks: - id: black files: python/.* - - repo: https://gitlab.com/pycqa/flake8 + - repo: https://github.com/PyCQA/flake8 rev: 3.8.3 hooks: - id: flake8 @@ -45,27 +45,49 @@ repos: name: flake8-cython args: ["--config=python/.flake8.cython"] types: [cython] - - repo: local - hooks: - - id: clang-format - name: clang-format - description: Format files with ClangFormat. - entry: clang-format -i - language: system - files: \.(cu|cuh|h|hpp|cpp|inl)$ - args: ['-fallback-style=none'] - repo: https://github.com/pre-commit/mirrors-mypy rev: 'v0.782' hooks: - id: mypy args: ["--config-file=python/cudf/setup.cfg", "python/cudf/cudf"] pass_filenames: false - - repo: https://github.com/pycqa/pydocstyle - rev: 6.0.0 + - repo: https://github.com/PyCQA/pydocstyle + rev: 6.1.1 hooks: - id: pydocstyle args: ["--config=python/.flake8"] - + - repo: local + hooks: + - id: clang-format + # Using the pre-commit stage to simplify invocation of all + # other hooks simultaneously (via any other hook stage). This + # can be removed if we also move to running clang-format + # entirely through pre-commit. + stages: [commit] + name: clang-format + description: Format files with ClangFormat. + entry: clang-format -i + language: system + files: \.(cu|cuh|h|hpp|cpp|inl)$ + args: ['-fallback-style=none'] + - id: cmake-format + name: cmake-format + entry: bash cpp/scripts/run-cmake-format.sh cmake-format + language: python + types: [cmake] + # Note that pre-commit autoupdate does not update the versions + # of dependencies, so we'll have to update this manually. + additional_dependencies: + - cmake-format==0.6.11 + - id: cmake-lint + name: cmake-lint + entry: bash cpp/scripts/run-cmake-format.sh cmake-lint + language: python + types: [cmake] + # Note that pre-commit autoupdate does not update the versions + # of dependencies, so we'll have to update this manually. + additional_dependencies: + - cmake-format==0.6.11 default_language_version: python: python3 diff --git a/build.sh b/build.sh index c9333a3e2af..d0ccd4821e0 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* REPODIR=$(cd $(dirname $0); pwd) VALIDARGS="clean libcudf cudf dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n -l --allgpuarch --disable_nvtx --show_depr_warn --ptds -h" -HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [-l] [--cmake-args=\"\"] +HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [-l] [--cmake-args=\\\"\\\"] clean - remove all existing build artifacts and configuration (start over) libcudf - build the cudf C++ code only diff --git a/ci/checks/style.sh b/ci/checks/style.sh index f99c26bdc63..67e926a0768 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -14,119 +14,18 @@ LANG=C.UTF-8 . 
/opt/conda/etc/profile.d/conda.sh conda activate rapids -# Run isort-cudf and get results/return code -ISORT_CUDF=`isort python/cudf --check-only --settings-path=python/cudf/setup.cfg 2>&1` -ISORT_CUDF_RETVAL=$? +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/main/cmake-format-rapids-cmake.json +export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json +mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) +wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} -# Run isort-cudf-kafka and get results/return code -ISORT_CUDF_KAFKA=`isort python/cudf_kafka --check-only --settings-path=python/cudf_kafka/setup.cfg 2>&1` -ISORT_CUDF_KAFKA_RETVAL=$? - -# Run isort-custreamz and get results/return code -ISORT_CUSTREAMZ=`isort python/custreamz --check-only --settings-path=python/custreamz/setup.cfg 2>&1` -ISORT_CUSTREAMZ_RETVAL=$? - -# Run isort-dask-cudf and get results/return code -ISORT_DASK_CUDF=`isort python/dask_cudf --check-only --settings-path=python/dask_cudf/setup.cfg 2>&1` -ISORT_DASK_CUDF_RETVAL=$? - -# Run black and get results/return code -BLACK=`black --check python 2>&1` -BLACK_RETVAL=$? - -# Run flake8 and get results/return code -FLAKE=`flake8 --config=python/.flake8 python 2>&1` -FLAKE_RETVAL=$? - -# Run flake8-cython and get results/return code -FLAKE_CYTHON=`flake8 --config=python/.flake8.cython 2>&1` -FLAKE_CYTHON_RETVAL=$? - -# Run mypy and get results/return code -MYPY_CUDF=`mypy --config=python/cudf/setup.cfg python/cudf/cudf 2>&1` -MYPY_CUDF_RETVAL=$? - -# Run pydocstyle and get results/return code -PYDOCSTYLE=`pydocstyle --config=python/.flake8 python 2>&1` -PYDOCSTYLE_RETVAL=$? +pre-commit run --hook-stage manual --all-files +PRE_COMMIT_RETVAL=$? # Run clang-format and check for a consistent code format CLANG_FORMAT=`python cpp/scripts/run-clang-format.py 2>&1` CLANG_FORMAT_RETVAL=$? 
-# Output results if failure otherwise show pass -if [ "$ISORT_CUDF_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: isort-cudf style check; begin output\n\n" - echo -e "$ISORT_CUDF" - echo -e "\n\n>>>> FAILED: isort-cudf style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: isort-cudf style check\n\n" -fi - -if [ "$ISORT_CUDF_KAFKA_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: isort-cudf-kafka style check; begin output\n\n" - echo -e "$ISORT_CUDF_KAFKA" - echo -e "\n\n>>>> FAILED: isort-cudf-kafka style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: isort-cudf-kafka style check\n\n" -fi - -if [ "$ISORT_CUSTREAMZ_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: isort-custreamz style check; begin output\n\n" - echo -e "$ISORT_CUSTREAMZ" - echo -e "\n\n>>>> FAILED: isort-custreamz style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: isort-custreamz style check\n\n" -fi - -if [ "$ISORT_DASK_CUDF_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: isort-dask-cudf style check; begin output\n\n" - echo -e "$ISORT_DASK_CUDF" - echo -e "\n\n>>>> FAILED: isort-dask-cudf style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: isort-dask-cudf style check\n\n" -fi - -if [ "$BLACK_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: black style check; begin output\n\n" - echo -e "$BLACK" - echo -e "\n\n>>>> FAILED: black style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: black style check\n\n" -fi - -if [ "$FLAKE_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: flake8 style check; begin output\n\n" - echo -e "$FLAKE" - echo -e "\n\n>>>> FAILED: flake8 style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: flake8 style check\n\n" -fi - -if [ "$FLAKE_CYTHON_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: flake8-cython style check; begin output\n\n" - echo -e "$FLAKE_CYTHON" - echo -e "\n\n>>>> FAILED: flake8-cython style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: flake8-cython style check\n\n" -fi - -if [ "$MYPY_CUDF_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: mypy style check; begin output\n\n" - echo -e "$MYPY_CUDF" - echo -e "\n\n>>>> FAILED: mypy style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: mypy style check\n\n" -fi - -if [ "$PYDOCSTYLE_RETVAL" != "0" ]; then - echo -e "\n\n>>>> FAILED: pydocstyle style check; begin output\n\n" - echo -e "$PYDOCSTYLE" - echo -e "\n\n>>>> FAILED: pydocstyle style check; end output\n\n" -else - echo -e "\n\n>>>> PASSED: pydocstyle style check\n\n" -fi - if [ "$CLANG_FORMAT_RETVAL" != "0" ]; then echo -e "\n\n>>>> FAILED: clang format check; begin output\n\n" echo -e "$CLANG_FORMAT" @@ -141,9 +40,7 @@ HEADER_META_RETVAL=$? 
echo -e "$HEADER_META" RETVALS=( - $ISORT_CUDF_RETVAL $ISORT_CUDF_KAFKA_RETVAL $ISORT_CUSTREAMZ_RETVAL $ISORT_DASK_CUDF_RETVAL - $BLACK_RETVAL $FLAKE_RETVAL $FLAKE_CYTHON_RETVAL $PYDOCSTYLE_RETVAL $CLANG_FORMAT_RETVAL - $HEADER_META_RETVAL $MYPY_CUDF_RETVAL + $PRE_COMMIT_RETVAL $CLANG_FORMAT_RETVAL ) IFS=$'\n' RETVAL=`echo "${RETVALS[*]}" | sort -nr | head -n1` diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 631801bc8d4..e2e95c34650 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -83,7 +83,7 @@ gpuci_mamba_retry install -y \ "rapids-notebook-env=$MINOR_VERSION.*" \ "dask-cuda=${MINOR_VERSION}" \ "rmm=$MINOR_VERSION.*" \ - "ucx-py=0.22.*" + "ucx-py=0.23.*" # https://docs.rapids.ai/maintainers/depmgmt/ # gpuci_mamba_retry remove --force rapids-build-env rapids-notebook-env diff --git a/ci/gpu/java.sh b/ci/gpu/java.sh index b46817bb9ab..0fd3f790f9f 100755 --- a/ci/gpu/java.sh +++ b/ci/gpu/java.sh @@ -80,7 +80,7 @@ gpuci_conda_retry install -y \ "rapids-notebook-env=$MINOR_VERSION.*" \ "dask-cuda=${MINOR_VERSION}" \ "rmm=$MINOR_VERSION.*" \ - "ucx-py=0.22.*" \ + "ucx-py=0.23.*" \ "openjdk=8.*" \ "maven" diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index b52b246af25..e6760317b4f 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -31,6 +31,7 @@ function sed_runner() { # cpp update sed_runner 's/'"CUDF VERSION .* LANGUAGES"'/'"CUDF VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt +sed_runner 's/'"branch-.*\/RAPIDS.cmake"'/'"branch-${NEXT_SHORT_TAG}\/RAPIDS.cmake"'/g' cpp/CMakeLists.txt # cpp libcudf_kafka update sed_runner 's/'"CUDA_KAFKA VERSION .* LANGUAGES"'/'"CUDA_KAFKA VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/libcudf_kafka/CMakeLists.txt diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 5e839589811..803e4f0ba26 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -38,8 +38,9 @@ dependencies: - black=19.10 - isort=5.6.4 - mypy=0.782 + - pydocstyle=6.1.1 - typing_extensions - - pre_commit + - pre-commit - dask>=2021.09.1 - distributed>=2021.09.1 - streamz @@ -58,6 +59,8 @@ dependencies: - cachetools - transformers<=4.10.3 - pydata-sphinx-theme + - librdkafka=1.7.0 + - python-confluent-kafka=1.7.0 - pip: - git+https://github.com/dask/dask.git@main - git+https://github.com/dask/distributed.git@main diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml index 28266b6af87..2281d361ebd 100644 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ b/conda/environments/cudf_dev_cuda11.2.yml @@ -38,8 +38,9 @@ dependencies: - black=19.10 - isort=5.6.4 - mypy=0.782 + - pydocstyle=6.1.1 - typing_extensions - - pre_commit + - pre-commit - dask>=2021.09.1 - distributed>=2021.09.1 - streamz @@ -58,6 +59,8 @@ dependencies: - cachetools - transformers<=4.10.3 - pydata-sphinx-theme + - librdkafka=1.7.0 + - python-confluent-kafka=1.7.0 - pip: - git+https://github.com/dask/dask.git@main - git+https://github.com/dask/distributed.git@main diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml new file mode 100644 index 00000000000..63800fe786b --- /dev/null +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -0,0 +1,68 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ +name: cudf_dev +channels: + - rapidsai + - nvidia + - rapidsai-nightly + - conda-forge +dependencies: + - clang=11.0.0 + - clang-tools=11.0.0 + - cupy>7.1.0,<10.0.0a0 + - rmm=21.12.* + - cmake>=3.20.1 + - cmake_setuptools>=0.1.3 + - python>=3.7,<3.9 + - numba>=0.53.1 + - numpy + - pandas>=1.0,<1.4.0dev0 + - pyarrow=5.0.0=*cuda + - fastavro>=0.22.9 + - python-snappy>=0.6.0 + - notebook>=0.5.0 + - cython>=0.29,<0.30 + - fsspec>=0.6.0 + - pytest + - pytest-benchmark + - pytest-xdist + - sphinx + - sphinxcontrib-websupport + - nbsphinx + - numpydoc + - ipython + - pandoc=<2.0.0 + - cudatoolkit=11.5 + - pip + - flake8=3.8.3 + - black=19.10 + - isort=5.6.4 + - mypy=0.782 + - pydocstyle=6.1.1 + - typing_extensions + - pre-commit + - dask>=2021.09.1 + - distributed>=2021.09.1 + - streamz + - arrow-cpp=5.0.0 + - dlpack>=0.5,<0.6.0a0 + - arrow-cpp-proc * cuda + - double-conversion + - rapidjson + - hypothesis + - sphinx-markdown-tables + - sphinx-copybutton + - mimesis<4.1 + - packaging + - protobuf + - nvtx>=0.2.1 + - cachetools + - transformers<=4.10.3 + - pydata-sphinx-theme + - librdkafka=1.7.0 + - python-confluent-kafka=1.7.0 + - pip: + - git+https://github.com/dask/dask.git@main + - git+https://github.com/dask/distributed.git@main + - git+https://github.com/python-streamz/streamz.git@master + - pyorc diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index ed3f8fa7139..5631e262b87 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -14,7 +14,7 @@ source: build: number: {{ GIT_DESCRIBE_NUMBER }} - string: py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + string: cuda_{{ cuda_version }}_py{{ py_version }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} script_env: - VERSION_SUFFIX - PARALLEL_LEVEL @@ -28,11 +28,13 @@ requirements: - cudf {{ version }} - dask>=2021.09.1 - distributed>=2021.09.1 + - cudatoolkit {{ cuda_version }} run: - python - cudf {{ version }} - dask>=2021.09.1 - distributed>=2021.09.1 + - {{ pin_compatible('cudatoolkit', max_pin='x.x') }} test: # [linux64] requires: # [linux64] diff --git a/conda/recipes/libcudf/build.sh b/conda/recipes/libcudf/build.sh index 5110b4e289b..703f8dc15c7 100644 --- a/conda/recipes/libcudf/build.sh +++ b/conda/recipes/libcudf/build.sh @@ -2,7 +2,7 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then # This assumes the script is executed from the root of the repo directory - ./build.sh -v libcudf --allgpuarch + ./build.sh -v libcudf --allgpuarch --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib\" else - ./build.sh -v libcudf tests --allgpuarch -fi \ No newline at end of file + ./build.sh -v libcudf tests --allgpuarch --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib\" +fi diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index fd687de6698..d644369c264 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -55,7 +55,6 @@ test: - test -f $PREFIX/include/cudf/ast/detail/operators.hpp - test -f $PREFIX/include/cudf/ast/expressions.hpp - test -f $PREFIX/include/cudf/binaryop.hpp - - test -f $PREFIX/include/cudf/labeling/label_bins.hpp - test -f $PREFIX/include/cudf/column/column_factories.hpp - test -f $PREFIX/include/cudf/column/column.hpp - test -f $PREFIX/include/cudf/column/column_view.hpp @@ -66,6 +65,7 @@ test: - test -f $PREFIX/include/cudf/detail/aggregation/result_cache.hpp - test -f $PREFIX/include/cudf/detail/label_bins.hpp - test -f 
$PREFIX/include/cudf/detail/binaryop.hpp + - test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh - test -f $PREFIX/include/cudf/detail/concatenate.hpp - test -f $PREFIX/include/cudf/detail/copy.hpp - test -f $PREFIX/include/cudf/detail/datetime.hpp @@ -93,6 +93,7 @@ test: - test -f $PREFIX/include/cudf/detail/sequence.hpp - test -f $PREFIX/include/cudf/detail/sorting.hpp - test -f $PREFIX/include/cudf/detail/stream_compaction.hpp + - test -f $PREFIX/include/cudf/detail/structs/utilities.hpp - test -f $PREFIX/include/cudf/detail/tdigest/tdigest.hpp - test -f $PREFIX/include/cudf/detail/transform.hpp - test -f $PREFIX/include/cudf/detail/transpose.hpp @@ -141,6 +142,7 @@ test: - test -f $PREFIX/include/cudf/io/types.hpp - test -f $PREFIX/include/cudf/ipc.hpp - test -f $PREFIX/include/cudf/join.hpp + - test -f $PREFIX/include/cudf/labeling/label_bins.hpp - test -f $PREFIX/include/cudf/lists/detail/combine.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp @@ -187,6 +189,7 @@ test: - test -f $PREFIX/include/cudf/strings/convert/convert_floats.hpp - test -f $PREFIX/include/cudf/strings/convert/convert_integers.hpp - test -f $PREFIX/include/cudf/strings/convert/convert_ipv4.hpp + - test -f $PREFIX/include/cudf/strings/convert/convert_lists.hpp - test -f $PREFIX/include/cudf/strings/convert/convert_urls.hpp - test -f $PREFIX/include/cudf/strings/detail/combine.hpp - test -f $PREFIX/include/cudf/strings/detail/concatenate.hpp @@ -202,6 +205,7 @@ test: - test -f $PREFIX/include/cudf/strings/find_multiple.hpp - test -f $PREFIX/include/cudf/strings/json.hpp - test -f $PREFIX/include/cudf/strings/padding.hpp + - test -f $PREFIX/include/cudf/strings/regex/flags.hpp - test -f $PREFIX/include/cudf/strings/repeat_strings.hpp - test -f $PREFIX/include/cudf/strings/replace.hpp - test -f $PREFIX/include/cudf/strings/replace_re.hpp @@ -218,6 +222,7 @@ test: - test -f $PREFIX/include/cudf/structs/detail/concatenate.hpp - test -f $PREFIX/include/cudf/table/table.hpp - test -f $PREFIX/include/cudf/table/table_view.hpp + - test -f $PREFIX/include/cudf/tdigest/tdigest_column_view.cuh - test -f $PREFIX/include/cudf/transform.hpp - test -f $PREFIX/include/cudf/transpose.hpp - test -f $PREFIX/include/cudf/types.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3b7bc8a223c..cf7b5be0e3e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -1,23 +1,22 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2018-2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake - ${CMAKE_BINARY_DIR}/RAPIDS.cmake) +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/RAPIDS.cmake + ${CMAKE_BINARY_DIR}/RAPIDS.cmake +) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) include(rapids-cmake) @@ -28,15 +27,19 @@ include(rapids-find) rapids_cuda_init_architectures(CUDF) -project(CUDF VERSION 21.12.00 LANGUAGES C CXX CUDA) +project( + CUDF + VERSION 21.12.00 + LANGUAGES C CXX CUDA +) -# Needed because GoogleBenchmark changes the state of FindThreads.cmake, -# causing subsequent runs to have different values for the `Threads::Threads` target. -# Setting this flag ensures `Threads::Threads` is the same value in first run and subsequent runs. +# Needed because GoogleBenchmark changes the state of FindThreads.cmake, causing subsequent runs to +# have different values for the `Threads::Threads` target. Setting this flag ensures +# `Threads::Threads` is the same value in first run and subsequent runs. set(THREADS_PREFER_PTHREAD_FLAG ON) -################################################################################################### -# - build options --------------------------------------------------------------------------------- +# ################################################################################################## +# * build options --------------------------------------------------------------------------------- option(USE_NVTX "Build with NVTX support" ON) option(BUILD_TESTS "Configure CMake to build tests" ON) @@ -50,8 +53,11 @@ option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OF option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" ON) option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) option(DISABLE_DEPRECATION_WARNING "Disable warnings generated from deprecated declarations." OFF) -# Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking -option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF) +# Option to enable line info in CUDA device compilation to allow introspection when profiling / +# memchecking +option(CUDA_ENABLE_LINEINFO + "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF +) # cudart can be statically linked or dynamically linked. 
The python ecosystem wants dynamic linking option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) @@ -63,8 +69,14 @@ message(VERBOSE "CUDF: Use a file cache for JIT compiled kernels: ${JITIFY_USE_C message(VERBOSE "CUDF: Build and statically link Arrow libraries: ${CUDF_USE_ARROW_STATIC}") message(VERBOSE "CUDF: Build and enable S3 filesystem support for Arrow: ${CUDF_ENABLE_ARROW_S3}") message(VERBOSE "CUDF: Build with per-thread default stream: ${PER_THREAD_DEFAULT_STREAM}") -message(VERBOSE "CUDF: Disable warnings generated from deprecated declarations: ${DISABLE_DEPRECATION_WARNING}") -message(VERBOSE "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler: ${CUDA_ENABLE_LINEINFO}") +message( + VERBOSE + "CUDF: Disable warnings generated from deprecated declarations: ${DISABLE_DEPRECATION_WARNING}" +) +message( + VERBOSE + "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler: ${CUDA_ENABLE_LINEINFO}" +) message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") # Set a default build type if none was specified @@ -78,36 +90,50 @@ set(CUDF_CXX_DEFINITIONS "") set(CUDF_CUDA_DEFINITIONS "") # Set RMM logging level -set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.") -set_property(CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF") +set(RMM_LOGGING_LEVEL + "INFO" + CACHE STRING "Choose the logging level." +) +set_property( + CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF" +) message(VERBOSE "CUDF: RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'.") -if (NOT CUDF_GENERATED_INCLUDE_DIR) - set(CUDF_GENERATED_INCLUDE_DIR ${CUDF_BINARY_DIR}) +if(NOT CUDF_GENERATED_INCLUDE_DIR) + set(CUDF_GENERATED_INCLUDE_DIR ${CUDF_BINARY_DIR}) endif() -################################################################################################### -# - conda environment ----------------------------------------------------------------------------- +# ################################################################################################## +# * conda environment ----------------------------------------------------------------------------- rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) -################################################################################################### -# - compiler options ------------------------------------------------------------------------------ -rapids_find_package(CUDAToolkit REQUIRED - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports) +# ################################################################################################## +# * compiler options ------------------------------------------------------------------------------ +rapids_find_package( + CUDAToolkit REQUIRED + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports +) include(cmake/Modules/ConfigureCUDA.cmake) # set other CUDA compilation flags +# ctest cuda memcheck +find_program(CUDA_SANITIZER compute-sanitizer) +set(MEMORYCHECK_COMMAND ${CUDA_SANITIZER}) +set(MEMORYCHECK_TYPE CudaSanitizer) +set(CUDA_SANITIZER_COMMAND_OPTIONS "--tool memcheck") -################################################################################################### -# - dependencies ---------------------------------------------------------------------------------- +# ################################################################################################## +# * dependencies 
---------------------------------------------------------------------------------- # find zlib rapids_find_package(ZLIB REQUIRED) # find Threads (needed by cudftestutil) -rapids_find_package(Threads REQUIRED - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports) +rapids_find_package( + Threads REQUIRED + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports +) # add third party dependencies using CPM rapids_cpm_init() @@ -125,8 +151,7 @@ include(cmake/thirdparty/get_arrow.cmake) include(cmake/thirdparty/get_dlpack.cmake) # find libcu++ include(cmake/thirdparty/get_libcudacxx.cmake) -# find cuCollections -# Should come after including thrust and libcudacxx +# find cuCollections Should come after including thrust and libcudacxx include(cmake/thirdparty/get_cucollections.cmake) # find or install GoogleTest include(cmake/thirdparty/get_gtest.cmake) @@ -135,350 +160,358 @@ include(cmake/Modules/JitifyPreprocessKernels.cmake) # find cuFile include(cmake/Modules/FindcuFile.cmake) -################################################################################################### -# - library targets ------------------------------------------------------------------------------- - -add_library(cudf - src/aggregation/aggregation.cpp - src/aggregation/aggregation.cu - src/aggregation/result_cache.cpp - src/ast/expression_parser.cpp - src/ast/expressions.cpp - src/binaryop/binaryop.cpp - src/binaryop/compiled/binary_ops.cu - src/binaryop/compiled/Add.cu - src/binaryop/compiled/ATan2.cu - src/binaryop/compiled/BitwiseAnd.cu - src/binaryop/compiled/BitwiseOr.cu - src/binaryop/compiled/BitwiseXor.cu - src/binaryop/compiled/Less.cu - src/binaryop/compiled/Greater.cu - src/binaryop/compiled/LessEqual.cu - src/binaryop/compiled/GreaterEqual.cu - src/binaryop/compiled/Div.cu - src/binaryop/compiled/equality_ops.cu - src/binaryop/compiled/FloorDiv.cu - src/binaryop/compiled/LogBase.cu - src/binaryop/compiled/LogicalAnd.cu - src/binaryop/compiled/LogicalOr.cu - src/binaryop/compiled/Mod.cu - src/binaryop/compiled/Mul.cu - src/binaryop/compiled/NullMax.cu - src/binaryop/compiled/NullMin.cu - src/binaryop/compiled/PMod.cu - src/binaryop/compiled/Pow.cu - src/binaryop/compiled/PyMod.cu - src/binaryop/compiled/ShiftLeft.cu - src/binaryop/compiled/ShiftRight.cu - src/binaryop/compiled/ShiftRightUnsigned.cu - src/binaryop/compiled/Sub.cu - src/binaryop/compiled/TrueDiv.cu - src/binaryop/compiled/util.cpp - src/labeling/label_bins.cu - src/bitmask/null_mask.cu - src/bitmask/is_element_valid.cpp - src/column/column.cu - src/column/column_device_view.cu - src/column/column_factories.cpp - src/column/column_factories.cu - src/column/column_view.cpp - src/comms/ipc/ipc.cpp - src/copying/concatenate.cu - src/copying/contiguous_split.cu - src/copying/copy.cpp - src/copying/copy.cu - src/copying/copy_range.cu - src/copying/gather.cu - src/copying/get_element.cu - src/copying/pack.cpp - src/copying/reverse.cu - src/copying/sample.cu - src/copying/scatter.cu - src/copying/shift.cu - src/copying/slice.cu - src/copying/split.cpp - src/copying/segmented_shift.cu - src/datetime/datetime_ops.cu - src/dictionary/add_keys.cu - src/dictionary/decode.cu - src/dictionary/detail/concatenate.cu - src/dictionary/detail/merge.cu - src/dictionary/dictionary_column_view.cpp - src/dictionary/dictionary_factories.cu - src/dictionary/encode.cu - src/dictionary/remove_keys.cu - src/dictionary/replace.cu - src/dictionary/search.cu - src/dictionary/set_keys.cu - src/filling/fill.cu - src/filling/repeat.cu - 
src/filling/sequence.cu - src/groupby/groupby.cu - src/groupby/hash/groupby.cu - src/groupby/sort/aggregate.cpp - src/groupby/sort/group_argmax.cu - src/groupby/sort/group_argmin.cu - src/groupby/sort/group_collect.cu - src/groupby/sort/group_count.cu - src/groupby/sort/group_m2.cu - src/groupby/sort/group_max.cu - src/groupby/sort/group_min.cu - src/groupby/sort/group_merge_lists.cu - src/groupby/sort/group_merge_m2.cu - src/groupby/sort/group_nth_element.cu - src/groupby/sort/group_nunique.cu - src/groupby/sort/group_product.cu - src/groupby/sort/group_quantiles.cu - src/groupby/sort/group_std.cu - src/groupby/sort/group_sum.cu - src/groupby/sort/scan.cpp - src/groupby/sort/group_count_scan.cu - src/groupby/sort/group_max_scan.cu - src/groupby/sort/group_min_scan.cu - src/groupby/sort/group_rank_scan.cu - src/groupby/sort/group_replace_nulls.cu - src/groupby/sort/group_sum_scan.cu - src/groupby/sort/group_tdigest.cu - src/groupby/sort/sort_helper.cu - src/hash/hashing.cu - src/hash/md5_hash.cu - src/hash/murmur_hash.cu - src/interop/dlpack.cpp - src/interop/from_arrow.cu - src/interop/to_arrow.cu - src/interop/detail/arrow_allocator.cpp - src/io/avro/avro.cpp - src/io/avro/avro_gpu.cu - src/io/avro/reader_impl.cu - src/io/comp/brotli_dict.cpp - src/io/comp/cpu_unbz2.cpp - src/io/comp/debrotli.cu - src/io/comp/gpuinflate.cu - src/io/comp/snap.cu - src/io/comp/uncomp.cpp - src/io/comp/unsnap.cu - src/io/csv/csv_gpu.cu - src/io/csv/durations.cu - src/io/csv/reader_impl.cu - src/io/csv/writer_impl.cu - src/io/functions.cpp - src/io/json/json_gpu.cu - src/io/json/reader_impl.cu - src/io/orc/dict_enc.cu - src/io/orc/orc.cpp - src/io/orc/reader_impl.cu - src/io/orc/stats_enc.cu - src/io/orc/stripe_data.cu - src/io/orc/stripe_enc.cu - src/io/orc/stripe_init.cu - src/io/orc/timezone.cpp - src/io/orc/writer_impl.cu - src/io/parquet/compact_protocol_writer.cpp - src/io/parquet/page_data.cu - src/io/parquet/chunk_dict.cu - src/io/parquet/page_enc.cu - src/io/parquet/page_hdr.cu - src/io/parquet/parquet.cpp - src/io/parquet/reader_impl.cu - src/io/parquet/writer_impl.cu - src/io/statistics/orc_column_statistics.cu - src/io/statistics/parquet_column_statistics.cu - src/io/text/multibyte_split.cu - src/io/utilities/column_buffer.cpp - src/io/utilities/data_sink.cpp - src/io/utilities/datasource.cpp - src/io/utilities/file_io_utilities.cpp - src/io/utilities/parsing_utils.cu - src/io/utilities/trie.cu - src/io/utilities/type_conversion.cpp - src/jit/cache.cpp - src/jit/parser.cpp - src/jit/type.cpp - src/join/conditional_join.cu - src/join/cross_join.cu - src/join/hash_join.cu - src/join/join.cu - src/join/join_utils.cu - src/join/semi_join.cu - src/lists/contains.cu - src/lists/combine/concatenate_list_elements.cu - src/lists/combine/concatenate_rows.cu - src/lists/copying/concatenate.cu - src/lists/copying/copying.cu - src/lists/copying/gather.cu - src/lists/copying/segmented_gather.cu - src/lists/copying/scatter_helper.cu - src/lists/count_elements.cu - src/lists/drop_list_duplicates.cu - src/lists/explode.cu - src/lists/extract.cu - src/lists/interleave_columns.cu - src/lists/lists_column_factories.cu - src/lists/lists_column_view.cu - src/lists/segmented_sort.cu - src/merge/merge.cu - src/partitioning/partitioning.cu - src/partitioning/round_robin.cu - src/quantiles/tdigest/tdigest.cu - src/quantiles/quantile.cu - src/quantiles/quantiles.cu - src/reductions/all.cu - src/reductions/any.cu - src/reductions/max.cu - src/reductions/mean.cu - src/reductions/min.cu - src/reductions/minmax.cu - 
src/reductions/nth_element.cu - src/reductions/product.cu - src/reductions/reductions.cpp - src/reductions/scan/rank_scan.cu - src/reductions/scan/scan.cpp - src/reductions/scan/scan_exclusive.cu - src/reductions/scan/scan_inclusive.cu - src/reductions/std.cu - src/reductions/sum.cu - src/reductions/sum_of_squares.cu - src/reductions/var.cu - src/replace/clamp.cu - src/replace/nans.cu - src/replace/nulls.cu - src/replace/replace.cu - src/reshape/byte_cast.cu - src/reshape/interleave_columns.cu - src/reshape/tile.cu - src/rolling/grouped_rolling.cu - src/rolling/range_window_bounds.cpp - src/rolling/rolling.cu - src/rolling/rolling_collect_list.cu - src/round/round.cu - src/scalar/scalar.cpp - src/scalar/scalar_factories.cpp - src/search/search.cu - src/sort/is_sorted.cu - src/sort/rank.cu - src/sort/segmented_sort.cu - src/sort/sort_column.cu - src/sort/sort.cu - src/sort/stable_sort_column.cu - src/sort/stable_sort.cu - src/stream_compaction/apply_boolean_mask.cu - src/stream_compaction/distinct_count.cu - src/stream_compaction/drop_duplicates.cu - src/stream_compaction/drop_nans.cu - src/stream_compaction/drop_nulls.cu - src/strings/attributes.cu - src/strings/capitalize.cu - src/strings/case.cu - src/strings/char_types/char_cases.cu - src/strings/char_types/char_types.cu - src/strings/combine/concatenate.cu - src/strings/combine/join.cu - src/strings/combine/join_list_elements.cu - src/strings/contains.cu - src/strings/convert/convert_booleans.cu - src/strings/convert/convert_datetime.cu - src/strings/convert/convert_durations.cu - src/strings/convert/convert_fixed_point.cu - src/strings/convert/convert_floats.cu - src/strings/convert/convert_hex.cu - src/strings/convert/convert_integers.cu - src/strings/convert/convert_ipv4.cu - src/strings/convert/convert_urls.cu - src/strings/copying/concatenate.cu - src/strings/copying/copying.cu - src/strings/copying/shift.cu - src/strings/extract.cu - src/strings/filling/fill.cu - src/strings/filter_chars.cu - src/strings/findall.cu - src/strings/find.cu - src/strings/find_multiple.cu - src/strings/padding.cu - src/strings/json/json_path.cu - src/strings/regex/regcomp.cpp - src/strings/regex/regexec.cu - src/strings/repeat_strings.cu - src/strings/replace/backref_re.cu - src/strings/replace/multi_re.cu - src/strings/replace/replace.cu - src/strings/replace/replace_re.cu - src/strings/split/partition.cu - src/strings/split/split.cu - src/strings/split/split_record.cu - src/strings/strings_column_factories.cu - src/strings/strings_column_view.cu - src/strings/strings_scalar_factories.cpp - src/strings/strip.cu - src/strings/substring.cu - src/strings/translate.cu - src/strings/utilities.cu - src/strings/wrap.cu - src/structs/copying/concatenate.cu - src/structs/structs_column_factories.cu - src/structs/structs_column_view.cpp - src/structs/utilities.cpp - src/table/table.cpp - src/table/table_device_view.cu - src/table/table_view.cpp - src/text/detokenize.cu - src/text/edit_distance.cu - src/text/generate_ngrams.cu - src/text/ngrams_tokenize.cu - src/text/normalize.cu - src/text/replace.cu - src/text/stemmer.cu - src/text/subword/data_normalizer.cu - src/text/subword/load_hash_file.cu - src/text/subword/subword_tokenize.cu - src/text/subword/wordpiece_tokenizer.cu - src/text/tokenize.cu - src/transform/bools_to_mask.cu - src/transform/compute_column.cu - src/transform/encode.cu - src/transform/mask_to_bools.cu - src/transform/nans_to_nulls.cu - src/transform/row_bit_count.cu - src/transform/transform.cpp - src/transpose/transpose.cu - 
src/unary/cast_ops.cu - src/unary/math_ops.cu - src/unary/nan_ops.cu - src/unary/null_ops.cu - src/utilities/default_stream.cpp - src/utilities/type_checks.cpp -) - -set_target_properties(cudf - PROPERTIES BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - # set target compile options - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON -) - -target_compile_options(cudf - PRIVATE "$<$:${CUDF_CXX_FLAGS}>" - "$<$:${CUDF_CUDA_FLAGS}>" +# ################################################################################################## +# * library targets ------------------------------------------------------------------------------- + +add_library( + cudf + src/aggregation/aggregation.cpp + src/aggregation/aggregation.cu + src/aggregation/result_cache.cpp + src/ast/expression_parser.cpp + src/ast/expressions.cpp + src/binaryop/binaryop.cpp + src/binaryop/compiled/binary_ops.cu + src/binaryop/compiled/Add.cu + src/binaryop/compiled/ATan2.cu + src/binaryop/compiled/BitwiseAnd.cu + src/binaryop/compiled/BitwiseOr.cu + src/binaryop/compiled/BitwiseXor.cu + src/binaryop/compiled/Less.cu + src/binaryop/compiled/Greater.cu + src/binaryop/compiled/LessEqual.cu + src/binaryop/compiled/GreaterEqual.cu + src/binaryop/compiled/Div.cu + src/binaryop/compiled/equality_ops.cu + src/binaryop/compiled/FloorDiv.cu + src/binaryop/compiled/LogBase.cu + src/binaryop/compiled/LogicalAnd.cu + src/binaryop/compiled/LogicalOr.cu + src/binaryop/compiled/Mod.cu + src/binaryop/compiled/Mul.cu + src/binaryop/compiled/NullMax.cu + src/binaryop/compiled/NullMin.cu + src/binaryop/compiled/PMod.cu + src/binaryop/compiled/Pow.cu + src/binaryop/compiled/PyMod.cu + src/binaryop/compiled/ShiftLeft.cu + src/binaryop/compiled/ShiftRight.cu + src/binaryop/compiled/ShiftRightUnsigned.cu + src/binaryop/compiled/Sub.cu + src/binaryop/compiled/TrueDiv.cu + src/binaryop/compiled/util.cpp + src/labeling/label_bins.cu + src/bitmask/null_mask.cu + src/bitmask/is_element_valid.cpp + src/column/column.cu + src/column/column_device_view.cu + src/column/column_factories.cpp + src/column/column_factories.cu + src/column/column_view.cpp + src/comms/ipc/ipc.cpp + src/copying/concatenate.cu + src/copying/contiguous_split.cu + src/copying/copy.cpp + src/copying/copy.cu + src/copying/copy_range.cu + src/copying/gather.cu + src/copying/get_element.cu + src/copying/pack.cpp + src/copying/reverse.cu + src/copying/sample.cu + src/copying/scatter.cu + src/copying/shift.cu + src/copying/slice.cu + src/copying/split.cpp + src/copying/segmented_shift.cu + src/datetime/datetime_ops.cu + src/dictionary/add_keys.cu + src/dictionary/decode.cu + src/dictionary/detail/concatenate.cu + src/dictionary/detail/merge.cu + src/dictionary/dictionary_column_view.cpp + src/dictionary/dictionary_factories.cu + src/dictionary/encode.cu + src/dictionary/remove_keys.cu + src/dictionary/replace.cu + src/dictionary/search.cu + src/dictionary/set_keys.cu + src/filling/calendrical_month_sequence.cu + src/filling/fill.cu + src/filling/repeat.cu + src/filling/sequence.cu + src/groupby/groupby.cu + src/groupby/hash/groupby.cu + src/groupby/sort/aggregate.cpp + src/groupby/sort/group_argmax.cu + src/groupby/sort/group_argmin.cu + src/groupby/sort/group_collect.cu + src/groupby/sort/group_correlation.cu + src/groupby/sort/group_count.cu + src/groupby/sort/group_m2.cu + src/groupby/sort/group_max.cu + src/groupby/sort/group_min.cu + src/groupby/sort/group_merge_lists.cu + 
src/groupby/sort/group_merge_m2.cu + src/groupby/sort/group_nth_element.cu + src/groupby/sort/group_nunique.cu + src/groupby/sort/group_product.cu + src/groupby/sort/group_quantiles.cu + src/groupby/sort/group_std.cu + src/groupby/sort/group_sum.cu + src/groupby/sort/scan.cpp + src/groupby/sort/group_count_scan.cu + src/groupby/sort/group_max_scan.cu + src/groupby/sort/group_min_scan.cu + src/groupby/sort/group_rank_scan.cu + src/groupby/sort/group_replace_nulls.cu + src/groupby/sort/group_sum_scan.cu + src/groupby/sort/group_tdigest.cu + src/groupby/sort/sort_helper.cu + src/hash/hashing.cu + src/hash/md5_hash.cu + src/hash/murmur_hash.cu + src/interop/dlpack.cpp + src/interop/from_arrow.cu + src/interop/to_arrow.cu + src/interop/detail/arrow_allocator.cpp + src/io/avro/avro.cpp + src/io/avro/avro_gpu.cu + src/io/avro/reader_impl.cu + src/io/comp/brotli_dict.cpp + src/io/comp/cpu_unbz2.cpp + src/io/comp/debrotli.cu + src/io/comp/gpuinflate.cu + src/io/comp/snap.cu + src/io/comp/uncomp.cpp + src/io/comp/unsnap.cu + src/io/csv/csv_gpu.cu + src/io/csv/durations.cu + src/io/csv/reader_impl.cu + src/io/csv/writer_impl.cu + src/io/functions.cpp + src/io/json/json_gpu.cu + src/io/json/reader_impl.cu + src/io/orc/aggregate_orc_metadata.cpp + src/io/orc/dict_enc.cu + src/io/orc/orc.cpp + src/io/orc/reader_impl.cu + src/io/orc/stats_enc.cu + src/io/orc/stripe_data.cu + src/io/orc/stripe_enc.cu + src/io/orc/stripe_init.cu + src/io/orc/timezone.cpp + src/io/orc/writer_impl.cu + src/io/parquet/compact_protocol_writer.cpp + src/io/parquet/page_data.cu + src/io/parquet/chunk_dict.cu + src/io/parquet/page_enc.cu + src/io/parquet/page_hdr.cu + src/io/parquet/parquet.cpp + src/io/parquet/reader_impl.cu + src/io/parquet/writer_impl.cu + src/io/statistics/orc_column_statistics.cu + src/io/statistics/parquet_column_statistics.cu + src/io/text/multibyte_split.cu + src/io/utilities/column_buffer.cpp + src/io/utilities/data_sink.cpp + src/io/utilities/datasource.cpp + src/io/utilities/file_io_utilities.cpp + src/io/utilities/parsing_utils.cu + src/io/utilities/trie.cu + src/io/utilities/type_conversion.cpp + src/jit/cache.cpp + src/jit/parser.cpp + src/jit/type.cpp + src/join/conditional_join.cu + src/join/cross_join.cu + src/join/hash_join.cu + src/join/join.cu + src/join/join_utils.cu + src/join/semi_join.cu + src/lists/contains.cu + src/lists/combine/concatenate_list_elements.cu + src/lists/combine/concatenate_rows.cu + src/lists/copying/concatenate.cu + src/lists/copying/copying.cu + src/lists/copying/gather.cu + src/lists/copying/segmented_gather.cu + src/lists/copying/scatter_helper.cu + src/lists/count_elements.cu + src/lists/drop_list_duplicates.cu + src/lists/explode.cu + src/lists/extract.cu + src/lists/interleave_columns.cu + src/lists/lists_column_factories.cu + src/lists/lists_column_view.cu + src/lists/segmented_sort.cu + src/merge/merge.cu + src/partitioning/partitioning.cu + src/partitioning/round_robin.cu + src/quantiles/tdigest/tdigest.cu + src/quantiles/tdigest/tdigest_column_view.cpp + src/quantiles/quantile.cu + src/quantiles/quantiles.cu + src/reductions/all.cu + src/reductions/any.cu + src/reductions/max.cu + src/reductions/mean.cu + src/reductions/min.cu + src/reductions/minmax.cu + src/reductions/nth_element.cu + src/reductions/product.cu + src/reductions/reductions.cpp + src/reductions/scan/rank_scan.cu + src/reductions/scan/scan.cpp + src/reductions/scan/scan_exclusive.cu + src/reductions/scan/scan_inclusive.cu + src/reductions/std.cu + src/reductions/sum.cu + 
src/reductions/sum_of_squares.cu + src/reductions/var.cu + src/replace/clamp.cu + src/replace/nans.cu + src/replace/nulls.cu + src/replace/replace.cu + src/reshape/byte_cast.cu + src/reshape/interleave_columns.cu + src/reshape/tile.cu + src/rolling/grouped_rolling.cu + src/rolling/range_window_bounds.cpp + src/rolling/rolling.cu + src/rolling/rolling_collect_list.cu + src/round/round.cu + src/scalar/scalar.cpp + src/scalar/scalar_factories.cpp + src/search/search.cu + src/sort/is_sorted.cu + src/sort/rank.cu + src/sort/segmented_sort.cu + src/sort/sort_column.cu + src/sort/sort.cu + src/sort/stable_sort_column.cu + src/sort/stable_sort.cu + src/stream_compaction/apply_boolean_mask.cu + src/stream_compaction/distinct_count.cu + src/stream_compaction/drop_duplicates.cu + src/stream_compaction/drop_nans.cu + src/stream_compaction/drop_nulls.cu + src/strings/attributes.cu + src/strings/capitalize.cu + src/strings/case.cu + src/strings/char_types/char_cases.cu + src/strings/char_types/char_types.cu + src/strings/combine/concatenate.cu + src/strings/combine/join.cu + src/strings/combine/join_list_elements.cu + src/strings/contains.cu + src/strings/convert/convert_booleans.cu + src/strings/convert/convert_datetime.cu + src/strings/convert/convert_durations.cu + src/strings/convert/convert_fixed_point.cu + src/strings/convert/convert_floats.cu + src/strings/convert/convert_hex.cu + src/strings/convert/convert_integers.cu + src/strings/convert/convert_ipv4.cu + src/strings/convert/convert_urls.cu + src/strings/convert/convert_lists.cu + src/strings/copying/concatenate.cu + src/strings/copying/copying.cu + src/strings/copying/shift.cu + src/strings/extract.cu + src/strings/filling/fill.cu + src/strings/filter_chars.cu + src/strings/findall.cu + src/strings/find.cu + src/strings/find_multiple.cu + src/strings/padding.cu + src/strings/json/json_path.cu + src/strings/regex/regcomp.cpp + src/strings/regex/regexec.cu + src/strings/repeat_strings.cu + src/strings/replace/backref_re.cu + src/strings/replace/multi_re.cu + src/strings/replace/replace.cu + src/strings/replace/replace_re.cu + src/strings/split/partition.cu + src/strings/split/split.cu + src/strings/split/split_record.cu + src/strings/strings_column_factories.cu + src/strings/strings_column_view.cpp + src/strings/strings_scalar_factories.cpp + src/strings/strip.cu + src/strings/substring.cu + src/strings/translate.cu + src/strings/utilities.cu + src/strings/wrap.cu + src/structs/copying/concatenate.cu + src/structs/structs_column_factories.cu + src/structs/structs_column_view.cpp + src/structs/utilities.cpp + src/table/table.cpp + src/table/table_device_view.cu + src/table/table_view.cpp + src/text/detokenize.cu + src/text/edit_distance.cu + src/text/generate_ngrams.cu + src/text/ngrams_tokenize.cu + src/text/normalize.cu + src/text/replace.cu + src/text/stemmer.cu + src/text/subword/data_normalizer.cu + src/text/subword/load_hash_file.cu + src/text/subword/subword_tokenize.cu + src/text/subword/wordpiece_tokenizer.cu + src/text/tokenize.cu + src/transform/bools_to_mask.cu + src/transform/compute_column.cu + src/transform/encode.cu + src/transform/mask_to_bools.cu + src/transform/nans_to_nulls.cu + src/transform/one_hot_encode.cu + src/transform/row_bit_count.cu + src/transform/transform.cpp + src/transpose/transpose.cu + src/unary/cast_ops.cu + src/unary/math_ops.cu + src/unary/nan_ops.cu + src/unary/null_ops.cu + src/utilities/default_stream.cpp + src/utilities/type_checks.cpp +) + +set_target_properties( + cudf + PROPERTIES BUILD_RPATH 
"\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON +) + +target_compile_options( + cudf PRIVATE "$<$:${CUDF_CXX_FLAGS}>" + "$<$:${CUDF_CUDA_FLAGS}>" ) # Specify include paths for the current target and dependents -target_include_directories(cudf - PUBLIC "$" - "$" - "$" - "$" - "$" - PRIVATE "$" - INTERFACE "$" - "$") +target_include_directories( + cudf + PUBLIC "$" + "$" + "$" + "$" + PRIVATE "$" + INTERFACE "$" +) -target_compile_definitions(cudf - PUBLIC "$<$:${CUDF_CXX_DEFINITIONS}>" - "$:${CUDF_CUDA_DEFINITIONS}>>" +target_compile_definitions( + cudf PUBLIC "$<$:${CUDF_CXX_DEFINITIONS}>" + "$:${CUDF_CUDA_DEFINITIONS}>>" ) # Disable Jitify log printing. See https://github.com/NVIDIA/jitify/issues/79 @@ -486,17 +519,17 @@ target_compile_definitions(cudf PRIVATE "JITIFY_PRINT_LOG=0") # Instruct jitify to use the kernel JIT cache if(JITIFY_USE_CACHE) - target_compile_definitions(cudf PUBLIC JITIFY_USE_CACHE "CUDF_VERSION=${PROJECT_VERSION}") + target_compile_definitions(cudf PUBLIC JITIFY_USE_CACHE "CUDF_VERSION=${PROJECT_VERSION}") endif() # Per-thread default stream if(PER_THREAD_DEFAULT_STREAM) - target_compile_definitions(cudf PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM) + target_compile_definitions(cudf PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM) endif() # Disable NVTX if necessary if(NOT USE_NVTX) - target_compile_definitions(cudf PUBLIC NVTX_DISABLE) + target_compile_definitions(cudf PUBLIC NVTX_DISABLE) endif() # Define spdlog level @@ -506,129 +539,128 @@ target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_L add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies -target_link_libraries(cudf - PUBLIC ${ARROW_LIBRARIES} - cudf::Thrust - rmm::rmm - PRIVATE cuco::cuco - ZLIB::ZLIB - nvcomp::nvcomp) +target_link_libraries( + cudf + PUBLIC ${ARROW_LIBRARIES} libcudacxx::libcudacxx cudf::Thrust rmm::rmm + PRIVATE cuco::cuco ZLIB::ZLIB nvcomp::nvcomp +) # Add Conda library, and include paths if specified if(TARGET conda_env) - target_link_libraries(cudf PRIVATE conda_env ) + target_link_libraries(cudf PRIVATE conda_env) endif() # Add cuFile interface if available if(TARGET cuFile::cuFile_interface) - target_link_libraries(cudf PRIVATE cuFile::cuFile_interface) + target_link_libraries(cudf PRIVATE cuFile::cuFile_interface) endif() if(CUDA_STATIC_RUNTIME) - # Tell CMake what CUDA language runtime to use - set_target_properties(cudf PROPERTIES CUDA_RUNTIME_LIBRARY Static) - # Make sure to export to consumers what runtime we used - target_link_libraries(cudf PUBLIC CUDA::cudart_static) + # Tell CMake what CUDA language runtime to use + set_target_properties(cudf PROPERTIES CUDA_RUNTIME_LIBRARY Static) + # Make sure to export to consumers what runtime we used + target_link_libraries(cudf PUBLIC CUDA::cudart_static) else() - # Tell CMake what CUDA language runtime to use - set_target_properties(cudf PROPERTIES CUDA_RUNTIME_LIBRARY Shared) - # Make sure to export to consumers what runtime we used - target_link_libraries(cudf PUBLIC CUDA::cudart) + # Tell CMake what CUDA language runtime to use + set_target_properties(cudf PROPERTIES CUDA_RUNTIME_LIBRARY Shared) + # Make sure to export to consumers what runtime we used + target_link_libraries(cudf PUBLIC CUDA::cudart) endif() -# The CUDA::cuda_driver is needed due to JITIFY sources which -# 
directly call the cuda driver API +# The CUDA::cuda_driver is needed due to JITIFY sources which directly call the cuda driver API if(NOT TARGET CUDA::cuda_driver) - message(FATAL_ERROR "Building libcudf requires `libcuda.so` to be present. - This error often occurs when trying to build libcudf from a container without the NVIDIA runtime loaded.") + message( + FATAL_ERROR + "Building libcudf requires `libcuda.so` to be present. + This error often occurs when trying to build libcudf from a container without the NVIDIA runtime loaded." + ) endif() target_link_libraries(cudf PUBLIC CUDA::cuda_driver) -file(WRITE "${CUDF_BINARY_DIR}/fatbin.ld" -[=[ +file( + WRITE "${CUDF_BINARY_DIR}/fatbin.ld" + [=[ SECTIONS { .nvFatBinSegment : { *(.nvFatBinSegment) } .nv_fatbin : { *(.nv_fatbin) } } -]=]) +]=] +) target_link_options(cudf PRIVATE "$") add_library(cudf::cudf ALIAS cudf) -################################################################################################### -# - tests and benchmarks -------------------------------------------------------------------------- -################################################################################################### +# ################################################################################################## +# * tests and benchmarks -------------------------------------------------------------------------- +# ################################################################################################## -################################################################################################### -# - build cudftestutil ---------------------------------------------------------------------------- +# ################################################################################################## +# * build cudftestutil ---------------------------------------------------------------------------- -add_library(cudftestutil STATIC - tests/utilities/base_fixture.cpp - tests/utilities/column_utilities.cu - tests/utilities/table_utilities.cu - tests/io/metadata_utilities.cpp - tests/strings/utilities.cu) - -set_target_properties(cudftestutil - PROPERTIES BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - # set target compile options - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON +add_library( + cudftestutil STATIC + tests/utilities/base_fixture.cpp tests/utilities/column_utilities.cu + tests/utilities/table_utilities.cu tests/io/metadata_utilities.cpp tests/strings/utilities.cpp ) +set_target_properties( + cudftestutil + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON +) -target_compile_options(cudftestutil - PUBLIC "$<$:${CUDF_CXX_FLAGS}>" - "$:${CUDF_CUDA_FLAGS}>>" +target_compile_options( + cudftestutil PUBLIC "$<$:${CUDF_CXX_FLAGS}>" + "$:${CUDF_CUDA_FLAGS}>>" ) -target_link_libraries(cudftestutil - PUBLIC GTest::gmock - GTest::gtest - Threads::Threads - cudf) +target_link_libraries(cudftestutil PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf) -target_include_directories(cudftestutil - PUBLIC "$" - "$") +target_include_directories( + cudftestutil PUBLIC "$" + "$" +) add_library(cudf::cudftestutil ALIAS cudftestutil) 
-################################################################################################### -# - add tests ------------------------------------------------------------------------------------- +# ################################################################################################## +# * add tests ------------------------------------------------------------------------------------- if(CUDF_BUILD_TESTS) - # include CTest module -- automatically calls enable_testing() - include(CTest) - add_subdirectory(tests) + # include CTest module -- automatically calls enable_testing() + include(CTest) + add_subdirectory(tests) endif() -################################################################################################### -# - add benchmarks -------------------------------------------------------------------------------- +# ################################################################################################## +# * add benchmarks -------------------------------------------------------------------------------- if(CUDF_BUILD_BENCHMARKS) - # Find or install GoogleBench - rapids_cpm_find(benchmark 1.5.2 - GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG v1.5.2 - GIT_SHALLOW TRUE - OPTIONS "BENCHMARK_ENABLE_TESTING OFF" - "BENCHMARK_ENABLE_INSTALL OFF") - - # Find or install NVBench - include(${rapids-cmake-dir}/cpm/nvbench.cmake) - rapids_cpm_nvbench() - add_subdirectory(benchmarks) + # Find or install GoogleBench + rapids_cpm_find( + benchmark 1.5.2 + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG v1.5.2 + GIT_SHALLOW TRUE + OPTIONS "BENCHMARK_ENABLE_TESTING OFF" "BENCHMARK_ENABLE_INSTALL OFF" + ) + + # Find or install NVBench + include(${rapids-cmake-dir}/cpm/nvbench.cmake) + rapids_cpm_nvbench() + add_subdirectory(benchmarks) endif() -################################################################################################### -# - install targets ------------------------------------------------------------------------------- +# ################################################################################################## +# * install targets ------------------------------------------------------------------------------- rapids_cmake_install_lib_dir(lib_dir) include(CPack) include(GNUInstallDirs) @@ -636,33 +668,34 @@ include(GNUInstallDirs) set(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME cudf) # install target for cudf_base and the proxy libcudf.so -install(TARGETS cudf - DESTINATION ${lib_dir} - EXPORT cudf-exports) - -install(DIRECTORY - ${CUDF_SOURCE_DIR}/include/cudf - ${CUDF_SOURCE_DIR}/include/cudf_test - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) - -install(DIRECTORY - ${CUDF_GENERATED_INCLUDE_DIR}/include/libcxx - ${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/libcudf) +install( + TARGETS cudf + DESTINATION ${lib_dir} + EXPORT cudf-exports +) -install(TARGETS cudftestutil - DESTINATION ${lib_dir} - EXPORT cudf-testing-exports) +install(DIRECTORY ${CUDF_SOURCE_DIR}/include/cudf ${CUDF_SOURCE_DIR}/include/cudf_test + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} +) -install(EXPORT cudf-testing-exports - FILE cudf-testing-targets.cmake - NAMESPACE cudf:: - DESTINATION "${lib_dir}/cmake/cudf") +install( + TARGETS cudftestutil + DESTINATION ${lib_dir} + EXPORT cudf-testing-exports +) +install( + EXPORT cudf-testing-exports + FILE cudf-testing-targets.cmake + NAMESPACE cudf:: + DESTINATION "${lib_dir}/cmake/cudf" +) 
include("${rapids-cmake-dir}/export/write_dependencies.cmake") -rapids_export_write_dependencies(INSTALL cudf-testing-exports - "${PROJECT_BINARY_DIR}/rapids-cmake/cudf/export/cudf-testing-dependencies.cmake") +rapids_export_write_dependencies( + INSTALL cudf-testing-exports + "${PROJECT_BINARY_DIR}/rapids-cmake/cudf/export/cudf-testing-dependencies.cmake" +) set(doc_string [=[ @@ -689,43 +722,17 @@ This module offers an optional testing component which defines the following IMPORTED GLOBAL targets: cudf::cudftestutil - The main cudf testing library - ]=]) - + ]=] +) set(common_code_string [=[ if(NOT TARGET cudf::Thrust) thrust_create_target(cudf::Thrust FROM_OPTIONS) endif() +]=] +) -# nvcc automatically adds the CUDA Toolkit system include paths before any -# system include paths that CMake adds. -# -# CMake implicitly treats all includes on import targets as 'SYSTEM' includes. -# -# To get the cudacxx shipped with cudf to be picked up by consumers instead of the -# version shipped with the CUDA Toolkit we need to make sure it is a non-SYSTEM -# include on the CMake side. -# -# To do this currently, we move the includes from the cudf::cudf target to a -# non-import target to ensure they are `-I` instead of `-isystem` - -add_library(cudf_non_system_includes INTERFACE) -target_link_libraries(cudf::cudf INTERFACE cudf_non_system_includes) - -get_target_property(all_includes cudf::cudf INTERFACE_INCLUDE_DIRECTORIES) -set(system_includes ) -set(normal_includes ) -foreach(include IN LISTS all_includes) - if(include MATCHES "/include/libcudf/") - list(APPEND normal_includes "${include}") - else() - list(APPEND system_includes "${include}") - endif() -endforeach() -set_target_properties(cudf::cudf PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${system_includes}") -set_target_properties(cudf_non_system_includes PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${normal_includes}") -]=]) set(install_code_string [=[ set(ArrowCUDA_DIR "${Arrow_DIR}") @@ -739,18 +746,21 @@ if(testing IN_LIST cudf_FIND_COMPONENTS) include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") endif() endif() -]=]) +]=] +) string(APPEND install_code_string "${common_code_string}") -rapids_export(INSTALL cudf - EXPORT_SET cudf-exports - GLOBAL_TARGETS cudf - NAMESPACE cudf:: - DOCUMENTATION doc_string - FINAL_CODE_BLOCK install_code_string) +rapids_export( + INSTALL cudf + EXPORT_SET cudf-exports + GLOBAL_TARGETS cudf + NAMESPACE cudf:: + DOCUMENTATION doc_string + FINAL_CODE_BLOCK install_code_string +) -################################################################################################ -# - build export ------------------------------------------------------------------------------- +# ################################################################################################## +# * build export ------------------------------------------------------------------------------- set(build_code_string [=[ if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-dependencies.cmake") @@ -759,29 +769,42 @@ endif() if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") include("${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") endif() -]=]) +]=] +) string(APPEND build_code_string "${common_code_string}") -rapids_export(BUILD cudf - EXPORT_SET cudf-exports - GLOBAL_TARGETS cudf - NAMESPACE cudf:: - DOCUMENTATION doc_string - FINAL_CODE_BLOCK build_code_string) +rapids_export( + BUILD cudf + EXPORT_SET cudf-exports + GLOBAL_TARGETS cudf + NAMESPACE cudf:: + DOCUMENTATION doc_string + FINAL_CODE_BLOCK 
build_code_string +) -export(EXPORT cudf-testing-exports - FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake - NAMESPACE cudf::) -rapids_export_write_dependencies(BUILD cudf-testing-exports - "${CUDF_BINARY_DIR}/cudf-testing-dependencies.cmake") +export( + EXPORT cudf-testing-exports + FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake + NAMESPACE cudf:: +) +rapids_export_write_dependencies( + BUILD cudf-testing-exports "${CUDF_BINARY_DIR}/cudf-testing-dependencies.cmake" +) -################################################################################################### -# - make documentation ---------------------------------------------------------------------------- +# ################################################################################################## +# * make documentation ---------------------------------------------------------------------------- # doc targets for cuDF -add_custom_command(OUTPUT CUDF_DOXYGEN - WORKING_DIRECTORY ${CUDF_SOURCE_DIR}/doxygen - COMMAND doxygen Doxyfile - VERBATIM) +add_custom_command( + OUTPUT CUDF_DOXYGEN + WORKING_DIRECTORY ${CUDF_SOURCE_DIR}/doxygen + COMMAND doxygen Doxyfile + VERBATIM + COMMENT "Custom command for building cudf doxygen docs." +) -add_custom_target(docs_cudf DEPENDS CUDF_DOXYGEN) +add_custom_target( + docs_cudf + DEPENDS CUDF_DOXYGEN + COMMENT "Custom command for building cudf doxygen docs." +) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index b3b92003573..fa1e61e26fd 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,226 +1,232 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2018-2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= find_package(Threads REQUIRED) add_library(cudf_datagen STATIC common/generate_benchmark_input.cpp) target_compile_features(cudf_datagen PUBLIC cxx_std_17 cuda_std_17) -target_compile_options(cudf_datagen - PUBLIC "$<$:${CUDF_CXX_FLAGS}>" - "$<$:${CUDF_CUDA_FLAGS}>") +target_compile_options( + cudf_datagen PUBLIC "$<$:${CUDF_CXX_FLAGS}>" + "$<$:${CUDF_CUDA_FLAGS}>" +) -target_link_libraries(cudf_datagen - PUBLIC GTest::gmock - GTest::gtest - GTest::gmock_main - GTest::gtest_main - benchmark::benchmark - nvbench::nvbench - Threads::Threads - cudf) +target_link_libraries( + cudf_datagen PUBLIC GTest::gmock GTest::gtest GTest::gmock_main GTest::gtest_main + benchmark::benchmark nvbench::nvbench Threads::Threads cudf +) -target_include_directories(cudf_datagen - PUBLIC "$" - "$" - "$") +target_include_directories( + cudf_datagen + PUBLIC "$" "$" + "$" +) -################################################################################################### -# - compiler function ----------------------------------------------------------------------------- +# ################################################################################################## +# * compiler function ----------------------------------------------------------------------------- # Use an OBJECT library so we only compile these helper source files only once -add_library(cudf_benchmark_common OBJECT - "${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp" - synchronization/synchronization.cpp - io/cuio_benchmark_common.cpp) +add_library( + cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp" + synchronization/synchronization.cpp io/cuio_benchmark_common.cpp +) target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen) +# This function takes in a benchmark name and benchmark source and handles setting all of the +# associated properties and linking to build the benchmark function(ConfigureBench CMAKE_BENCH_NAME) - add_executable(${CMAKE_BENCH_NAME} ${ARGN}) - set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") - target_link_libraries(${CMAKE_BENCH_NAME} - PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main) + add_executable(${CMAKE_BENCH_NAME} ${ARGN}) + set_target_properties( + ${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "$" + ) + target_link_libraries( + ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main + ) endfunction() +# This function takes in a benchmark name and benchmark source for nvbench benchmarks and handles +# setting all of the associated properties and linking to build the benchmark function(ConfigureNVBench CMAKE_BENCH_NAME) - add_executable(${CMAKE_BENCH_NAME} ${ARGN}) - set_target_properties(${CMAKE_BENCH_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") - target_link_libraries(${CMAKE_BENCH_NAME} - PRIVATE cudf_benchmark_common cudf_datagen nvbench::main) + add_executable(${CMAKE_BENCH_NAME} ${ARGN}) + set_target_properties( + ${CMAKE_BENCH_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "$" + ) + target_link_libraries( + ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen nvbench::main + ) endfunction() -################################################################################################### -# - column benchmarks ----------------------------------------------------------------------------- +# 
################################################################################################## +# * column benchmarks ----------------------------------------------------------------------------- ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate_benchmark.cpp) -################################################################################################### -# - gather benchmark ------------------------------------------------------------------------------ +# ################################################################################################## +# * gather benchmark ------------------------------------------------------------------------------ ConfigureBench(GATHER_BENCH copying/gather_benchmark.cu) -################################################################################################### -# - scatter benchmark ----------------------------------------------------------------------------- +# ################################################################################################## +# * scatter benchmark ----------------------------------------------------------------------------- ConfigureBench(SCATTER_BENCH copying/scatter_benchmark.cu) -################################################################################################### -# - lists scatter benchmark ----------------------------------------------------------------------- +# ################################################################################################## +# * lists scatter benchmark ----------------------------------------------------------------------- ConfigureBench(SCATTER_LISTS_BENCH lists/copying/scatter_lists_benchmark.cu) -################################################################################################### -# - contiguous_split benchmark ------------------------------------------------------------------- +# ################################################################################################## +# * contiguous_split benchmark ------------------------------------------------------------------- ConfigureBench(CONTIGUOUS_SPLIT_BENCH copying/contiguous_split_benchmark.cu) -################################################################################################### -# - shift benchmark ------------------------------------------------------------------------------- +# ################################################################################################## +# * shift benchmark ------------------------------------------------------------------------------- ConfigureBench(SHIFT_BENCH copying/shift_benchmark.cu) -################################################################################################### -# - transpose benchmark --------------------------------------------------------------------------- +# ################################################################################################## +# * copy-if-else benchmark +# ----------------------------------------------------------------------------- +ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else_benchmark.cpp) + +# ################################################################################################## +# * transpose benchmark --------------------------------------------------------------------------- ConfigureBench(TRANSPOSE_BENCH transpose/transpose_benchmark.cu) -################################################################################################### -# - apply_boolean_mask benchmark 
------------------------------------------------------------------ +# ################################################################################################## +# * apply_boolean_mask benchmark ------------------------------------------------------------------ ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask_benchmark.cpp) -################################################################################################### -# - stream_compaction benchmark ------------------------------------------------------------------- +# ################################################################################################## +# * stream_compaction benchmark ------------------------------------------------------------------- ConfigureBench(STREAM_COMPACTION_BENCH stream_compaction/drop_duplicates_benchmark.cpp) -################################################################################################### -# - join benchmark -------------------------------------------------------------------------------- +# ################################################################################################## +# * join benchmark -------------------------------------------------------------------------------- ConfigureBench(JOIN_BENCH join/join_benchmark.cu join/conditional_join_benchmark.cu) ConfigureNVBench(JOIN_NVBENCH join/join_nvbench.cu) -################################################################################################### -# - iterator benchmark ---------------------------------------------------------------------------- +# ################################################################################################## +# * iterator benchmark ---------------------------------------------------------------------------- ConfigureBench(ITERATOR_BENCH iterator/iterator_benchmark.cu) -################################################################################################### -# - search benchmark ------------------------------------------------------------------------------ +# ################################################################################################## +# * search benchmark ------------------------------------------------------------------------------ ConfigureBench(SEARCH_BENCH search/search_benchmark.cpp) -################################################################################################### -# - sort benchmark -------------------------------------------------------------------------------- -ConfigureBench(SORT_BENCH - sort/rank_benchmark.cpp - sort/sort_benchmark.cpp - sort/sort_strings_benchmark.cpp) +# ################################################################################################## +# * sort benchmark -------------------------------------------------------------------------------- +ConfigureBench( + SORT_BENCH sort/rank_benchmark.cpp sort/sort_benchmark.cpp sort/sort_strings_benchmark.cpp +) -################################################################################################### -# - quantiles benchmark -------------------------------------------------------------------------------- -ConfigureBench(QUANTILES_BENCH - quantiles/quantiles_benchmark.cpp) +# ################################################################################################## +# * quantiles benchmark +# -------------------------------------------------------------------------------- +ConfigureBench(QUANTILES_BENCH quantiles/quantiles_benchmark.cpp) 
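(Aside, not part of the diff: every benchmark in this file is registered through the ConfigureBench / ConfigureNVBench helpers documented earlier in this CMakeLists.txt. As a hedged sketch, adding a hypothetical new benchmark would follow the same pattern; the target names and source paths below are invented for illustration and do not exist in this PR.)

# Hypothetical registrations only -- not part of this change set.
ConfigureBench(MY_FEATURE_BENCH my_feature/my_feature_benchmark.cpp)   # Google Benchmark based
ConfigureNVBench(MY_FEATURE_NVBENCH my_feature/my_feature_nvbench.cu)  # NVBench based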
-################################################################################################### -# - type_dispatcher benchmark --------------------------------------------------------------------- +# ################################################################################################## +# * type_dispatcher benchmark --------------------------------------------------------------------- ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher_benchmark.cu) -################################################################################################### -# - reduction benchmark --------------------------------------------------------------------------- -ConfigureBench(REDUCTION_BENCH - reduction/anyall_benchmark.cpp - reduction/dictionary_benchmark.cpp - reduction/reduce_benchmark.cpp - reduction/scan_benchmark.cpp - reduction/minmax_benchmark.cpp) - -################################################################################################### -# - reduction benchmark --------------------------------------------------------------------------- -ConfigureBench(REPLACE_BENCH - replace/clamp_benchmark.cpp) - -################################################################################################### -# - filling benchmark ----------------------------------------------------------------------------- -ConfigureBench(FILL_BENCH - filling/repeat_benchmark.cpp) - -################################################################################################### -# - groupby benchmark ----------------------------------------------------------------------------- -ConfigureBench(GROUPBY_BENCH - groupby/group_sum_benchmark.cu - groupby/group_nth_benchmark.cu - groupby/group_shift_benchmark.cu) - -################################################################################################### -# - hashing benchmark ----------------------------------------------------------------------------- -ConfigureBench(HASHING_BENCH - hashing/hash_benchmark.cpp - hashing/partition_benchmark.cpp) - -################################################################################################### -# - merge benchmark ------------------------------------------------------------------------------- +# ################################################################################################## +# * reduction benchmark --------------------------------------------------------------------------- +ConfigureBench( + REDUCTION_BENCH reduction/anyall_benchmark.cpp reduction/dictionary_benchmark.cpp + reduction/reduce_benchmark.cpp reduction/scan_benchmark.cpp reduction/minmax_benchmark.cpp +) + +# ################################################################################################## +# * reduction benchmark --------------------------------------------------------------------------- +ConfigureBench(REPLACE_BENCH replace/clamp_benchmark.cpp replace/nans_benchmark.cpp) + +# ################################################################################################## +# * filling benchmark ----------------------------------------------------------------------------- +ConfigureBench(FILL_BENCH filling/repeat_benchmark.cpp) + +# ################################################################################################## +# * groupby benchmark ----------------------------------------------------------------------------- +ConfigureBench( + GROUPBY_BENCH groupby/group_sum_benchmark.cu groupby/group_nth_benchmark.cu + groupby/group_shift_benchmark.cu 
groupby/group_struct_benchmark.cu +) + +# ################################################################################################## +# * hashing benchmark ----------------------------------------------------------------------------- +ConfigureBench(HASHING_BENCH hashing/hash_benchmark.cpp hashing/partition_benchmark.cpp) + +# ################################################################################################## +# * merge benchmark ------------------------------------------------------------------------------- ConfigureBench(MERGE_BENCH merge/merge_benchmark.cpp) -################################################################################################### -# - null_mask benchmark --------------------------------------------------------------------------- +# ################################################################################################## +# * null_mask benchmark --------------------------------------------------------------------------- ConfigureBench(NULLMASK_BENCH null_mask/set_null_mask_benchmark.cpp) -################################################################################################### -# - parquet writer chunks benchmark --------------------------------------------------------------- +# ################################################################################################## +# * parquet writer chunks benchmark --------------------------------------------------------------- ConfigureBench(PARQUET_WRITER_CHUNKS_BENCH io/parquet/parquet_writer_chunks_benchmark.cpp) -################################################################################################### -# - parquet reader benchmark ---------------------------------------------------------------------- +# ################################################################################################## +# * parquet reader benchmark ---------------------------------------------------------------------- ConfigureBench(PARQUET_READER_BENCH io/parquet/parquet_reader_benchmark.cpp) -################################################################################################### -# - orc reader benchmark -------------------------------------------------------------------------- +# ################################################################################################## +# * orc reader benchmark -------------------------------------------------------------------------- ConfigureBench(ORC_READER_BENCH io/orc/orc_reader_benchmark.cpp) -################################################################################################### -# - csv reader benchmark -------------------------------------------------------------------------- +# ################################################################################################## +# * csv reader benchmark -------------------------------------------------------------------------- ConfigureBench(CSV_READER_BENCH io/csv/csv_reader_benchmark.cpp) -################################################################################################### -# - parquet writer benchmark ---------------------------------------------------------------------- +# ################################################################################################## +# * parquet writer benchmark ---------------------------------------------------------------------- ConfigureBench(PARQUET_WRITER_BENCH io/parquet/parquet_writer_benchmark.cpp) 
-################################################################################################### -# - orc writer benchmark -------------------------------------------------------------------------- +# ################################################################################################## +# * orc writer benchmark -------------------------------------------------------------------------- ConfigureBench(ORC_WRITER_BENCH io/orc/orc_writer_benchmark.cpp) -################################################################################################### -# - csv writer benchmark -------------------------------------------------------------------------- +# ################################################################################################## +# * csv writer benchmark -------------------------------------------------------------------------- ConfigureBench(CSV_WRITER_BENCH io/csv/csv_writer_benchmark.cpp) -################################################################################################### -# - ast benchmark --------------------------------------------------------------------------------- +# ################################################################################################## +# * ast benchmark --------------------------------------------------------------------------------- ConfigureBench(AST_BENCH ast/transform_benchmark.cpp) -################################################################################################### -# - binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH - binaryop/binaryop_benchmark.cpp - binaryop/compiled_binaryop_benchmark.cpp - binaryop/jit_binaryop_benchmark.cpp) - -################################################################################################### -# - nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH +# ################################################################################################## +# * binaryop benchmark ---------------------------------------------------------------------------- +ConfigureBench( + BINARYOP_BENCH binaryop/binaryop_benchmark.cpp binaryop/compiled_binaryop_benchmark.cpp + binaryop/jit_binaryop_benchmark.cpp +) + +# ################################################################################################## +# * nvtext benchmark ------------------------------------------------------------------- +ConfigureBench( + TEXT_BENCH text/ngrams_benchmark.cpp text/normalize_benchmark.cpp text/normalize_spaces_benchmark.cpp text/replace_benchmark.cpp text/subword_benchmark.cpp - text/tokenize_benchmark.cpp) + text/tokenize_benchmark.cpp +) -################################################################################################### -# - strings benchmark ------------------------------------------------------------------- -ConfigureBench(STRINGS_BENCH +# ################################################################################################## +# * strings benchmark ------------------------------------------------------------------- +ConfigureBench( + STRINGS_BENCH string/case_benchmark.cpp string/combine_benchmark.cpp string/contains_benchmark.cpp @@ -239,14 +245,13 @@ ConfigureBench(STRINGS_BENCH string/split_benchmark.cpp string/substring_benchmark.cpp string/translate_benchmark.cpp - string/url_decode_benchmark.cpp) + string/url_decode_benchmark.cpp +) 
-################################################################################################### -# - json benchmark ------------------------------------------------------------------- -ConfigureBench(JSON_BENCH - string/json_benchmark.cpp) +# ################################################################################################## +# * json benchmark ------------------------------------------------------------------- +ConfigureBench(JSON_BENCH string/json_benchmark.cpp) -################################################################################################### -# - io benchmark --------------------------------------------------------------------- -ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK - io/text/multibyte_split_benchmark.cpp) +# ################################################################################################## +# * io benchmark --------------------------------------------------------------------- +ConfigureBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split_benchmark.cpp) diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index ba2bc245484..0ec2590bdb5 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -297,12 +297,21 @@ std::unique_ptr create_random_column(data_profile const& profile, } } + // cudf expects the null mask buffer to be padded up to 64 bytes. so allocate the proper size and + // copy what we have. + rmm::device_buffer result_bitmask{cudf::bitmask_allocation_size_bytes(num_rows), + rmm::cuda_stream_default}; + cudaMemcpyAsync(result_bitmask.data(), + null_mask.data(), + null_mask.size() * sizeof(cudf::bitmask_type), + cudaMemcpyHostToDevice, + rmm::cuda_stream_default); + return std::make_unique( cudf::data_type{cudf::type_to_id()}, num_rows, rmm::device_buffer(data.data(), num_rows * sizeof(stored_Type), rmm::cuda_stream_default), - rmm::device_buffer( - null_mask.data(), null_mask.size() * sizeof(cudf::bitmask_type), rmm::cuda_stream_default)); + std::move(result_bitmask)); } /** diff --git a/cpp/benchmarks/copying/copy_if_else_benchmark.cpp b/cpp/benchmarks/copying/copy_if_else_benchmark.cpp new file mode 100644 index 00000000000..513e4f4c179 --- /dev/null +++ b/cpp/benchmarks/copying/copy_if_else_benchmark.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +#include + +class CopyIfElse : public cudf::benchmark { +}; + +template +static void BM_copy_if_else(benchmark::State& state, bool nulls) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto input_type = cudf::type_to_id(); + auto bool_type = cudf::type_id::BOOL8; + auto const input = create_random_table({input_type, input_type, bool_type}, 3, row_count{n_rows}); + + if (!nulls) { + input->get_column(2).set_null_mask(rmm::device_buffer{}, 0); + input->get_column(1).set_null_mask(rmm::device_buffer{}, 0); + input->get_column(0).set_null_mask(rmm::device_buffer{}, 0); + } + + cudf::column_view decision(input->view().column(2)); + cudf::column_view rhs(input->view().column(1)); + cudf::column_view lhs(input->view().column(0)); + + for (auto _ : state) { + cuda_event_timer raii(state, true, rmm::cuda_stream_default); + cudf::copy_if_else(lhs, rhs, decision); + } +} + +#define COPY_BENCHMARK_DEFINE(name, type, b) \ + BENCHMARK_DEFINE_F(CopyIfElse, name) \ + (::benchmark::State & st) { BM_copy_if_else(st, b); } \ + BENCHMARK_REGISTER_F(CopyIfElse, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 12, 1 << 27}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +COPY_BENCHMARK_DEFINE(int16, int16_t, true) +COPY_BENCHMARK_DEFINE(uint32, uint32_t, true) +COPY_BENCHMARK_DEFINE(float64, double, true) +COPY_BENCHMARK_DEFINE(int16_no_nulls, int16_t, false) +COPY_BENCHMARK_DEFINE(uint32_no_nulls, uint32_t, false) +COPY_BENCHMARK_DEFINE(float64_no_nulls, double, false) diff --git a/cpp/benchmarks/groupby/group_struct_benchmark.cu b/cpp/benchmarks/groupby/group_struct_benchmark.cu new file mode 100644 index 00000000000..702983a63bf --- /dev/null +++ b/cpp/benchmarks/groupby/group_struct_benchmark.cu @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#include + +static constexpr cudf::size_type num_struct_members = 8; +static constexpr cudf::size_type max_int = 100; +static constexpr cudf::size_type max_str_length = 32; + +static auto create_data_table(cudf::size_type n_rows) +{ + data_profile table_profile; + table_profile.set_distribution_params(cudf::type_id::INT32, distribution_id::UNIFORM, 0, max_int); + table_profile.set_distribution_params( + cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); + + // The first two struct members are int32 and string. + // The first column is also used as keys in groupby. + auto col_ids = std::vector{cudf::type_id::INT32, cudf::type_id::STRING}; + + // The subsequent struct members are int32 and string again. 
+ for (cudf::size_type i = 3; i <= num_struct_members; ++i) { + if (i % 2) { + col_ids.push_back(cudf::type_id::INT32); + } else { + col_ids.push_back(cudf::type_id::STRING); + } + } + + return create_random_table(col_ids, num_struct_members, row_count{n_rows}, table_profile); +} + +// Max aggregation/scan technically has the same performance as min. +template +void BM_groupby_min_struct(benchmark::State& state) +{ + auto const n_rows = static_cast(state.range(0)); + auto data_cols = create_data_table(n_rows)->release(); + + auto const keys_view = data_cols.front()->view(); + auto const values = + cudf::make_structs_column(keys_view.size(), std::move(data_cols), 0, rmm::device_buffer()); + + using RequestType = std::conditional_t, + cudf::groupby::aggregation_request, + cudf::groupby::scan_request>; + + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys_view})); + auto requests = std::vector(); + requests.emplace_back(RequestType()); + requests.front().values = values->view(); + requests.front().aggregations.push_back(cudf::make_min_aggregation()); + + for (auto _ : state) { + [[maybe_unused]] auto const timer = cuda_event_timer(state, true); + if constexpr (std::is_same_v) { + [[maybe_unused]] auto const result = gb_obj.aggregate(requests); + } else { + [[maybe_unused]] auto const result = gb_obj.scan(requests); + } + } +} + +class Groupby : public cudf::benchmark { +}; + +#define MIN_RANGE 10'000 +#define MAX_RANGE 10'000'000 + +#define REGISTER_BENCHMARK(name, op_type) \ + BENCHMARK_DEFINE_F(Groupby, name)(::benchmark::State & state) \ + { \ + BM_groupby_min_struct(state); \ + } \ + BENCHMARK_REGISTER_F(Groupby, name) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond) \ + ->RangeMultiplier(4) \ + ->Ranges({{MIN_RANGE, MAX_RANGE}}); + +REGISTER_BENCHMARK(Aggregation, cudf::groupby_aggregation) +REGISTER_BENCHMARK(Scan, cudf::groupby_scan_aggregation) diff --git a/cpp/benchmarks/join/conditional_join_benchmark.cu b/cpp/benchmarks/join/conditional_join_benchmark.cu index 141e726027b..bf078ff51eb 100644 --- a/cpp/benchmarks/join/conditional_join_benchmark.cu +++ b/cpp/benchmarks/join/conditional_join_benchmark.cu @@ -148,6 +148,7 @@ BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit) ->Unit(benchmark::kMillisecond) ->Args({100'000, 100'000}) ->Args({100'000, 400'000}) + ->Args({400'000, 100'000}) ->Args({100'000, 1'000'000}) ->UseManualTime(); @@ -155,6 +156,7 @@ BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit) ->Unit(benchmark::kMillisecond) ->Args({100'000, 100'000}) ->Args({100'000, 400'000}) + ->Args({400'000, 100'000}) ->Args({100'000, 1'000'000}) ->UseManualTime(); @@ -162,6 +164,7 @@ BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit_nulls) ->Unit(benchmark::kMillisecond) ->Args({100'000, 100'000}) ->Args({100'000, 400'000}) + ->Args({400'000, 100'000}) ->Args({100'000, 1'000'000}) ->UseManualTime(); @@ -169,6 +172,7 @@ BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit_nulls) ->Unit(benchmark::kMillisecond) ->Args({100'000, 100'000}) ->Args({100'000, 400'000}) + ->Args({400'000, 100'000}) ->Args({100'000, 1'000'000}) ->UseManualTime(); diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index d7f64716e58..e846317f472 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -41,17 +41,12 @@ __global__ static void init_curand(curandState* state, const int nstates) template __global__ static 
void init_build_tbl(key_type* const build_tbl, const size_type build_tbl_size, - const key_type rand_max, - const bool uniq_build_tbl_keys, - key_type* const lottery, - const size_type lottery_size, + const int multiplicity, curandState* state, const int num_states) { - static_assert(std::is_signed::value, "key_type needs to be signed for lottery to work"); - - const int start_idx = blockIdx.x * blockDim.x + threadIdx.x; - const key_type stride = blockDim.x * gridDim.x; + auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; + auto const stride = blockDim.x * gridDim.x; assert(start_idx < num_states); curandState localState = state[start_idx]; @@ -59,28 +54,7 @@ __global__ static void init_build_tbl(key_type* const build_tbl, for (size_type idx = start_idx; idx < build_tbl_size; idx += stride) { const double x = curand_uniform_double(&localState); - if (uniq_build_tbl_keys) { - // If the build table keys need to be unique, go through lottery array from lottery_idx until - // finding a key which has not been used (-1). Mark the key as been used by atomically setting - // the spot to -1. - - size_type lottery_idx = x * lottery_size; - key_type lottery_val = -1; - - while (-1 == lottery_val) { - lottery_val = lottery[lottery_idx]; - - if (-1 != lottery_val) { - lottery_val = atomicCAS(lottery + lottery_idx, lottery_val, -1); - } - - lottery_idx = (lottery_idx + 1) % lottery_size; - } - - build_tbl[idx] = lottery_val; - } else { - build_tbl[idx] = x * rand_max; - } + build_tbl[idx] = static_cast(x * (build_tbl_size / multiplicity)); } state[start_idx] = localState; @@ -89,16 +63,15 @@ __global__ static void init_build_tbl(key_type* const build_tbl, template __global__ void init_probe_tbl(key_type* const probe_tbl, const size_type probe_tbl_size, - const key_type* const build_tbl, const size_type build_tbl_size, - const key_type* const lottery, - const size_type lottery_size, + const key_type rand_max, const double selectivity, + const int multiplicity, curandState* state, const int num_states) { - const int start_idx = blockIdx.x * blockDim.x + threadIdx.x; - const size_type stride = blockDim.x * gridDim.x; + auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x; + auto const stride = blockDim.x * gridDim.x; assert(start_idx < num_states); curandState localState = state[start_idx]; @@ -109,21 +82,15 @@ __global__ void init_probe_tbl(key_type* const probe_tbl, if (x <= selectivity) { // x <= selectivity means this key in the probe table should be present in the build table, so - // we pick a key from build_tbl - x = curand_uniform_double(&localState); - size_type build_tbl_idx = x * build_tbl_size; - - if (build_tbl_idx >= build_tbl_size) { build_tbl_idx = build_tbl_size - 1; } - - val = build_tbl[build_tbl_idx]; + // we pick a key from [0, build_tbl_size / multiplicity] + x = curand_uniform_double(&localState); + val = static_cast(x * (build_tbl_size / multiplicity)); } else { // This key in the probe table should not be present in the build table, so we pick a key from - // lottery. - x = curand_uniform_double(&localState); - size_type lottery_idx = x * lottery_size; - val = lottery[lottery_idx]; + // [build_tbl_size, rand_max]. 
+ x = curand_uniform_double(&localState); + val = static_cast(x * (rand_max - build_tbl_size) + build_tbl_size); } - probe_tbl[idx] = val; } @@ -152,9 +119,7 @@ __global__ void init_probe_tbl(key_type* const probe_tbl, * @param[in] build_tbl_size number of keys in the build table * @param[in] selectivity probability with which an element of the probe table is * present in the build table. - * @param[in] rand_max maximum random number to generate. I.e. random numbers are - * integers from [0,rand_max]. - * @param[in] uniq_build_tbl_keys if each key in the build table should appear exactly once. + * @param[in] multiplicity number of matches for each key. */ template void generate_input_tables(key_type* const build_tbl, @@ -162,8 +127,7 @@ void generate_input_tables(key_type* const build_tbl, key_type* const probe_tbl, const size_type probe_tbl_size, const double selectivity, - const key_type rand_max, - const bool uniq_build_tbl_keys) + const int multiplicity) { // With large values of rand_max the a lot of temporary storage is needed for the lottery. At the // expense of not being that accurate with applying the selectivity an especially more memory @@ -171,9 +135,7 @@ void generate_input_tables(key_type* const build_tbl, // let one table choose random numbers from only one interval and the other only select with // selective probability from the same interval and from the other in the other cases. - static_assert(std::is_signed::value, "key_type needs to be signed for lottery to work"); - - const int block_size = 128; + constexpr int block_size = 128; // Maximize exposed parallelism while minimizing storage for curand state int num_blocks_init_build_tbl{-1}; @@ -198,55 +160,20 @@ void generate_input_tables(key_type* const build_tbl, CHECK_CUDA(0); - size_type lottery_size = - rand_max < std::numeric_limits::max() - 1 ? 
rand_max + 1 : rand_max; - rmm::device_uvector lottery(lottery_size, rmm::cuda_stream_default); - - if (uniq_build_tbl_keys) { - thrust::sequence(rmm::exec_policy(), lottery.begin(), lottery.end(), 0); - } - - init_build_tbl - <<>>(build_tbl, - build_tbl_size, - rand_max, - uniq_build_tbl_keys, - lottery.data(), - lottery_size, - devStates.data(), - num_states); + init_build_tbl<<>>( + build_tbl, build_tbl_size, multiplicity, devStates.data(), num_states); CHECK_CUDA(0); - rmm::device_uvector build_tbl_sorted(build_tbl_size, rmm::cuda_stream_default); - - CUDA_TRY(cudaMemcpy(build_tbl_sorted.data(), - build_tbl, - build_tbl_size * sizeof(key_type), - cudaMemcpyDeviceToDevice)); - - thrust::sort(rmm::exec_policy(), build_tbl_sorted.begin(), build_tbl_sorted.end()); - - // Exclude keys used in build table from lottery - thrust::counting_iterator first_lottery_elem(0); - thrust::counting_iterator last_lottery_elem = first_lottery_elem + lottery_size; - key_type* lottery_end = thrust::set_difference(rmm::exec_policy(), - first_lottery_elem, - last_lottery_elem, - build_tbl_sorted.begin(), - build_tbl_sorted.end(), - lottery.data()); - - lottery_size = thrust::distance(lottery.data(), lottery_end); + auto const rand_max = std::numeric_limits::max(); init_probe_tbl <<>>(probe_tbl, probe_tbl_size, - build_tbl, build_tbl_size, - lottery.data(), - lottery_size, + rand_max, selectivity, + multiplicity, devStates.data(), num_states); diff --git a/cpp/benchmarks/join/join_benchmark_common.hpp b/cpp/benchmarks/join/join_benchmark_common.hpp index add87bf7dfb..e88253395d8 100644 --- a/cpp/benchmarks/join/join_benchmark_common.hpp +++ b/cpp/benchmarks/join/join_benchmark_common.hpp @@ -60,14 +60,13 @@ static void BM_join(state_type& state, Join JoinFunc) } }(); - const cudf::size_type rand_max_val{build_table_size * 2}; - const double selectivity = 0.3; - const bool is_build_table_key_unique = true; + const double selectivity = 0.3; + const int multiplicity = 1; // Generate build and probe tables cudf::test::UniformRandomGenerator rand_gen(0, build_table_size); auto build_random_null_mask = [&rand_gen](int size) { - // roughly 25% nulls + // roughly 75% nulls auto validity = thrust::make_transform_iterator( thrust::make_counting_iterator(0), [&rand_gen](auto i) { return (rand_gen.generate() & 3) == 0; }); @@ -95,8 +94,7 @@ static void BM_join(state_type& state, Join JoinFunc) probe_key_column->mutable_view().data(), probe_table_size, selectivity, - rand_max_val, - is_build_table_key_unique); + multiplicity); auto payload_data_it = thrust::make_counting_iterator(0); cudf::test::fixed_width_column_wrapper build_payload_column( @@ -125,12 +123,12 @@ static void BM_join(state_type& state, Join JoinFunc) if constexpr (std::is_same_v and (not is_conditional)) { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; - JoinFunc(probe_table, - build_table, - columns_to_join, - columns_to_join, - cudf::null_equality::UNEQUAL, - stream_view); + auto result = JoinFunc(probe_table, + build_table, + columns_to_join, + columns_to_join, + cudf::null_equality::UNEQUAL, + stream_view); }); } diff --git a/cpp/benchmarks/replace/clamp_benchmark.cpp b/cpp/benchmarks/replace/clamp_benchmark.cpp index f897b9d82cc..4d9da4aca6d 100644 --- a/cpp/benchmarks/replace/clamp_benchmark.cpp +++ b/cpp/benchmarks/replace/clamp_benchmark.cpp @@ -30,7 +30,7 @@ class ReplaceClamp : public cudf::benchmark { }; template -static void BM_reduction_scan(benchmark::State& state, 
bool include_nulls) +static void BM_clamp(benchmark::State& state, bool include_nulls) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const dtype = cudf::type_to_id(); @@ -58,15 +58,15 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) } } -#define CLAMP_BENCHMARK_DEFINE(name, type, nulls) \ - BENCHMARK_DEFINE_F(ReplaceClamp, name) \ - (::benchmark::State & state) { BM_reduction_scan(state, nulls); } \ - BENCHMARK_REGISTER_F(ReplaceClamp, name) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ +#define CLAMP_BENCHMARK_DEFINE(name, type, nulls) \ + BENCHMARK_DEFINE_F(ReplaceClamp, name) \ + (::benchmark::State & state) { BM_clamp(state, nulls); } \ + BENCHMARK_REGISTER_F(ReplaceClamp, name) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ ->Arg(100000000); /* 100M */ CLAMP_BENCHMARK_DEFINE(int8_no_nulls, int8_t, false); diff --git a/cpp/benchmarks/replace/nans_benchmark.cpp b/cpp/benchmarks/replace/nans_benchmark.cpp new file mode 100644 index 00000000000..a337ae5e7ad --- /dev/null +++ b/cpp/benchmarks/replace/nans_benchmark.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +class ReplaceNans : public cudf::benchmark { +}; + +template +static void BM_replace_nans(benchmark::State& state, bool include_nulls) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const dtype = cudf::type_to_id(); + auto const table = create_random_table({dtype}, 1, row_count{n_rows}); + if (!include_nulls) { table->get_column(0).set_null_mask(rmm::device_buffer{}, 0); } + cudf::column_view input(table->view().column(0)); + + auto zero = cudf::make_fixed_width_scalar(0); + + for (auto _ : state) { + cuda_event_timer timer(state, true); + auto result = cudf::replace_nans(input, *zero); + } +} + +#define NANS_BENCHMARK_DEFINE(name, type, nulls) \ + BENCHMARK_DEFINE_F(ReplaceNans, name) \ + (::benchmark::State & state) { BM_replace_nans(state, nulls); } \ + BENCHMARK_REGISTER_F(ReplaceNans, name) \ + ->UseManualTime() \ + ->Arg(10000) /* 10k */ \ + ->Arg(100000) /* 100k */ \ + ->Arg(1000000) /* 1M */ \ + ->Arg(10000000) /* 10M */ \ + ->Arg(100000000); /* 100M */ + +NANS_BENCHMARK_DEFINE(float32_nulls, float, true); +NANS_BENCHMARK_DEFINE(float64_nulls, double, true); +NANS_BENCHMARK_DEFINE(float32_no_nulls, float, false); +NANS_BENCHMARK_DEFINE(float64_no_nulls, double, false); diff --git a/cpp/benchmarks/string/extract_benchmark.cpp b/cpp/benchmarks/string/extract_benchmark.cpp index 161e30c6f25..7ed083d9571 100644 --- a/cpp/benchmarks/string/extract_benchmark.cpp +++ b/cpp/benchmarks/string/extract_benchmark.cpp @@ -47,8 +47,8 @@ static void BM_extract(benchmark::State& state, int groups) return row; }); - std::string pattern; - while (static_cast(pattern.size()) < groups) { + std::string pattern{""}; + while (groups--) { pattern += "(\\d+) "; } @@ -86,6 +86,7 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -STRINGS_BENCHMARK_DEFINE(small, 2) -STRINGS_BENCHMARK_DEFINE(medium, 10) -STRINGS_BENCHMARK_DEFINE(large, 30) +STRINGS_BENCHMARK_DEFINE(one, 1) +STRINGS_BENCHMARK_DEFINE(two, 2) +STRINGS_BENCHMARK_DEFINE(four, 4) +STRINGS_BENCHMARK_DEFINE(eight, 8) diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake index a537b5e3beb..198435e739d 100644 --- a/cpp/cmake/Modules/ConfigureCUDA.cmake +++ b/cpp/cmake/Modules/ConfigureCUDA.cmake @@ -1,21 +1,19 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2018-2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= if(CMAKE_COMPILER_IS_GNUCXX) - list(APPEND CUDF_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) + list(APPEND CUDF_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) endif() list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) @@ -25,20 +23,21 @@ list(APPEND CUDF_CUDA_FLAGS -Werror=cross-execution-space-call) list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) if(DISABLE_DEPRECATION_WARNING) - list(APPEND CUDF_CXX_FLAGS -Wno-deprecated-declarations) - list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations) + list(APPEND CUDF_CXX_FLAGS -Wno-deprecated-declarations) + list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations) endif() # make sure we produce smallest binary size list(APPEND CUDF_CUDA_FLAGS -Xfatbin=-compress-all) -# Option to enable line info in CUDA device compilation to allow introspection when profiling / memchecking +# Option to enable line info in CUDA device compilation to allow introspection when profiling / +# memchecking if(CUDA_ENABLE_LINEINFO) - list(APPEND CUDF_CUDA_FLAGS -lineinfo) + list(APPEND CUDF_CUDA_FLAGS -lineinfo) endif() # Debug options if(CMAKE_BUILD_TYPE MATCHES Debug) - message(VERBOSE "CUDF: Building with debugging flags") - list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-rdynamic) + message(VERBOSE "CUDF: Building with debugging flags") + list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-rdynamic) endif() diff --git a/cpp/cmake/Modules/FindcuFile.cmake b/cpp/cmake/Modules/FindcuFile.cmake index 880ad773369..e539a6604a8 100644 --- a/cpp/cmake/Modules/FindcuFile.cmake +++ b/cpp/cmake/Modules/FindcuFile.cmake @@ -1,17 +1,15 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2020, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
#[=======================================================================[.rst: FindcuFile @@ -48,81 +46,73 @@ This will define the following variables in your project: #]=======================================================================] - -# use pkg-config to get the directories and then use these values -# in the FIND_PATH() and FIND_LIBRARY() calls +# use pkg-config to get the directories and then use these values in the FIND_PATH() and +# FIND_LIBRARY() calls find_package(PkgConfig QUIET) pkg_check_modules(PKG_cuFile QUIET cuFile) set(cuFile_COMPILE_OPTIONS ${PKG_cuFile_CFLAGS_OTHER}) set(cuFile_VERSION ${PKG_cuFile_VERSION}) -find_path(cuFile_INCLUDE_DIR - NAMES - cufile.h - HINTS - ${PKG_cuFile_INCLUDE_DIRS} - /usr/local/cuda/include - /usr/local/cuda/lib64 +find_path( + cuFile_INCLUDE_DIR + NAMES cufile.h + HINTS ${PKG_cuFile_INCLUDE_DIRS} /usr/local/cuda/include /usr/local/cuda/lib64 ) -find_library(cuFile_LIBRARY - NAMES - cufile - HINTS - ${PKG_cuFile_LIBRARY_DIRS} - /usr/local/cuda/lib64 +find_library( + cuFile_LIBRARY + NAMES cufile + HINTS ${PKG_cuFile_LIBRARY_DIRS} /usr/local/cuda/lib64 ) -find_library(cuFileRDMA_LIBRARY - NAMES - cufile_rdma - HINTS - ${PKG_cuFile_LIBRARY_DIRS} - /usr/local/cuda/lib64 +find_library( + cuFileRDMA_LIBRARY + NAMES cufile_rdma + HINTS ${PKG_cuFile_LIBRARY_DIRS} /usr/local/cuda/lib64 ) include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(cuFile - FOUND_VAR - cuFile_FOUND - REQUIRED_VARS - cuFile_LIBRARY - cuFileRDMA_LIBRARY - cuFile_INCLUDE_DIR - VERSION_VAR - cuFile_VERSION +find_package_handle_standard_args( + cuFile + FOUND_VAR cuFile_FOUND + REQUIRED_VARS cuFile_LIBRARY cuFileRDMA_LIBRARY cuFile_INCLUDE_DIR + VERSION_VAR cuFile_VERSION ) -if (cuFile_INCLUDE_DIR AND NOT TARGET cuFile::cuFile_interface) +if(cuFile_INCLUDE_DIR AND NOT TARGET cuFile::cuFile_interface) add_library(cuFile::cuFile_interface IMPORTED INTERFACE) - target_include_directories(cuFile::cuFile_interface INTERFACE "$") + target_include_directories( + cuFile::cuFile_interface INTERFACE "$" + ) target_compile_options(cuFile::cuFile_interface INTERFACE "${cuFile_COMPILE_OPTIONS}") target_compile_definitions(cuFile::cuFile_interface INTERFACE CUFILE_FOUND) -endif () +endif() -if (cuFile_FOUND AND NOT TARGET cuFile::cuFile) +if(cuFile_FOUND AND NOT TARGET cuFile::cuFile) add_library(cuFile::cuFile UNKNOWN IMPORTED) - set_target_properties(cuFile::cuFile PROPERTIES - IMPORTED_LOCATION "${cuFile_LIBRARY}" - INTERFACE_COMPILE_OPTIONS "${cuFile_COMPILE_OPTIONS}" - INTERFACE_INCLUDE_DIRECTORIES "${cuFile_INCLUDE_DIR}" + set_target_properties( + cuFile::cuFile + PROPERTIES IMPORTED_LOCATION "${cuFile_LIBRARY}" + INTERFACE_COMPILE_OPTIONS "${cuFile_COMPILE_OPTIONS}" + INTERFACE_INCLUDE_DIRECTORIES "${cuFile_INCLUDE_DIR}" ) -endif () +endif() -if (cuFile_FOUND AND NOT TARGET cuFile::cuFileRDMA) +if(cuFile_FOUND AND NOT TARGET cuFile::cuFileRDMA) add_library(cuFile::cuFileRDMA UNKNOWN IMPORTED) - set_target_properties(cuFile::cuFileRDMA PROPERTIES - IMPORTED_LOCATION "${cuFileRDMA_LIBRARY}" - INTERFACE_COMPILE_OPTIONS "${cuFile_COMPILE_OPTIONS}" - INTERFACE_INCLUDE_DIRECTORIES "${cuFile_INCLUDE_DIR}" + set_target_properties( + cuFile::cuFileRDMA + PROPERTIES IMPORTED_LOCATION "${cuFileRDMA_LIBRARY}" + INTERFACE_COMPILE_OPTIONS "${cuFile_COMPILE_OPTIONS}" + INTERFACE_INCLUDE_DIRECTORIES "${cuFile_INCLUDE_DIR}" ) -endif () +endif() mark_as_advanced(cuFile_LIBRARY cuFileRDMA_LIBRARY cuFile_INCLUDE_DIR) -if (cuFile_FOUND) +if(cuFile_FOUND) set(cuFile_LIBRARIES 
${cuFile_LIBRARY}) set(cuFileRDMA_LIBRARIES ${cuFileRDMA_LIBRARY}) set(cuFile_INCLUDE_DIRS ${cuFile_INCLUDE_DIR}) -endif () +endif() diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index 7e2ec5254d3..c2ad25760b8 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -1,67 +1,62 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= # Create `jitify_preprocess` executable add_executable(jitify_preprocess "${JITIFY_INCLUDE_DIR}/jitify2_preprocess.cpp") target_link_libraries(jitify_preprocess CUDA::cudart ${CMAKE_DL_LIBS}) +# Take a list of files to JIT-compile and run them through jitify_preprocess. 
function(jit_preprocess_files) - cmake_parse_arguments(ARG - "" - "SOURCE_DIRECTORY" - "FILES" - ${ARGN} - ) + cmake_parse_arguments(ARG "" "SOURCE_DIRECTORY" "FILES" ${ARGN}) - foreach(ARG_FILE ${ARG_FILES}) - set(ARG_OUTPUT ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files/${ARG_FILE}.jit.hpp) - get_filename_component(jit_output_directory "${ARG_OUTPUT}" DIRECTORY ) - list(APPEND JIT_PREPROCESSED_FILES "${ARG_OUTPUT}") - add_custom_command(WORKING_DIRECTORY ${ARG_SOURCE_DIRECTORY} - DEPENDS jitify_preprocess "${ARG_SOURCE_DIRECTORY}/${ARG_FILE}" - OUTPUT ${ARG_OUTPUT} - VERBATIM - COMMAND ${CMAKE_COMMAND} -E make_directory "${jit_output_directory}" - COMMAND jitify_preprocess ${ARG_FILE} - -o ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files - -i - -m - -std=c++17 - -remove-unused-globals - -D__CUDACC_RTC__ - -I${CUDF_SOURCE_DIR}/include - -I${CUDF_SOURCE_DIR}/src - -I${LIBCUDACXX_INCLUDE_DIR} - -I${CUDAToolkit_INCLUDE_DIRS} - --no-preinclude-workarounds - --no-replace-pragma-once - ) - endforeach() - set(JIT_PREPROCESSED_FILES "${JIT_PREPROCESSED_FILES}" PARENT_SCOPE) + get_target_property(libcudacxx_raw_includes libcudacxx::libcudacxx INTERFACE_INCLUDE_DIRECTORIES) + foreach(inc IN LISTS libcudacxx_raw_includes) + list(APPEND libcudacxx_includes "-I${inc}") + endforeach() + foreach(ARG_FILE ${ARG_FILES}) + set(ARG_OUTPUT ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files/${ARG_FILE}.jit.hpp) + get_filename_component(jit_output_directory "${ARG_OUTPUT}" DIRECTORY) + list(APPEND JIT_PREPROCESSED_FILES "${ARG_OUTPUT}") + add_custom_command( + OUTPUT ${ARG_OUTPUT} + DEPENDS jitify_preprocess "${ARG_SOURCE_DIRECTORY}/${ARG_FILE}" + WORKING_DIRECTORY ${ARG_SOURCE_DIRECTORY} + VERBATIM + COMMAND ${CMAKE_COMMAND} -E make_directory "${jit_output_directory}" + COMMAND + jitify_preprocess ${ARG_FILE} -o + ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files -i -m -std=c++17 + -remove-unused-globals -D__CUDACC_RTC__ -I${CUDF_SOURCE_DIR}/include + -I${CUDF_SOURCE_DIR}/src ${libcudacxx_includes} -I${CUDAToolkit_INCLUDE_DIRS} + --no-preinclude-workarounds --no-replace-pragma-once + COMMENT "Custom command to JIT-compile files." + ) + endforeach() + set(JIT_PREPROCESSED_FILES + "${JIT_PREPROCESSED_FILES}" + PARENT_SCOPE + ) endfunction() -jit_preprocess_files(SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src - FILES binaryop/jit/kernel.cu - transform/jit/masked_udf_kernel.cu - transform/jit/kernel.cu - rolling/jit/kernel.cu - ) +jit_preprocess_files( + SOURCE_DIRECTORY ${CUDF_SOURCE_DIR}/src FILES binaryop/jit/kernel.cu + transform/jit/masked_udf_kernel.cu transform/jit/kernel.cu rolling/jit/kernel.cu +) -add_custom_target(jitify_preprocess_run DEPENDS ${JIT_PREPROCESSED_FILES}) - -file(COPY "${LIBCUDACXX_INCLUDE_DIR}/" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcudacxx") -file(COPY "${LIBCXX_INCLUDE_DIR}" DESTINATION "${CUDF_GENERATED_INCLUDE_DIR}/include/libcxx") +add_custom_target( + jitify_preprocess_run + DEPENDS ${JIT_PREPROCESSED_FILES} + COMMENT "Target representing jitified files." 
+) diff --git a/cpp/cmake/config.json b/cpp/cmake/config.json index fa3f769ccfb..4f287499503 100644 --- a/cpp/cmake/config.json +++ b/cpp/cmake/config.json @@ -25,7 +25,8 @@ "tab_size": 2, "command_case": "unchanged", "max_lines_hwrap": 1, - "max_pargs_hwrap": 999 + "max_pargs_hwrap": 999, + "dangle_parens": true }, "lint": { "disabled_codes": ["C0301"], diff --git a/cpp/cmake/libcudacxx.patch b/cpp/cmake/libcudacxx.patch new file mode 100644 index 00000000000..3cdc40ef084 --- /dev/null +++ b/cpp/cmake/libcudacxx.patch @@ -0,0 +1,21 @@ +diff --git a/include/cuda/std/detail/__config b/include/cuda/std/detail/__config +index d55a43688..654142d7e 100644 +--- a/include/cuda/std/detail/__config ++++ b/include/cuda/std/detail/__config +@@ -23,7 +23,7 @@ + #define _LIBCUDACXX_CUDACC_VER_MINOR __CUDACC_VER_MINOR__ + #define _LIBCUDACXX_CUDACC_VER_BUILD __CUDACC_VER_BUILD__ + #define _LIBCUDACXX_CUDACC_VER \ +- _LIBCUDACXX_CUDACC_VER_MAJOR * 10000 + _LIBCUDACXX_CUDACC_VER_MINOR * 100 + \ ++ _LIBCUDACXX_CUDACC_VER_MAJOR * 100000 + _LIBCUDACXX_CUDACC_VER_MINOR * 1000 + \ + _LIBCUDACXX_CUDACC_VER_BUILD + + #define _LIBCUDACXX_HAS_NO_LONG_DOUBLE +@@ -64,7 +64,7 @@ + # endif + #endif + +-#if defined(_LIBCUDACXX_COMPILER_MSVC) || (defined(_LIBCUDACXX_CUDACC_VER) && (_LIBCUDACXX_CUDACC_VER < 110500)) ++#if defined(_LIBCUDACXX_COMPILER_MSVC) || (defined(_LIBCUDACXX_CUDACC_VER) && (_LIBCUDACXX_CUDACC_VER < 1105000)) + # define _LIBCUDACXX_HAS_NO_INT128 + #endif diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 41751c7dd50..5fe37402096 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -1,163 +1,188 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2020-2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#============================================================================= - -function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON ENABLE_PARQUET) - - if(BUILD_STATIC) - if(TARGET arrow_static AND TARGET arrow_cuda_static) - list(APPEND ARROW_LIBRARIES arrow_static) - list(APPEND ARROW_LIBRARIES arrow_cuda_static) - set(ARROW_FOUND TRUE PARENT_SCOPE) - set(ARROW_LIBRARIES ${ARROW_LIBRARIES} PARENT_SCOPE) - return() - endif() - else() - if(TARGET arrow_shared AND TARGET arrow_cuda_shared) - list(APPEND ARROW_LIBRARIES arrow_shared) - list(APPEND ARROW_LIBRARIES arrow_cuda_shared) - set(ARROW_FOUND TRUE PARENT_SCOPE) - set(ARROW_LIBRARIES ${ARROW_LIBRARIES} PARENT_SCOPE) - return() - endif() - endif() - - set(ARROW_BUILD_SHARED ON) - set(ARROW_BUILD_STATIC OFF) - set(CPMAddOrFindPackage CPMFindPackage) +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Finding arrow is far more complex than it should be, and as a result we violate multiple linting +# rules aiming to limit complexity. Since all our other CMake scripts conform to expectations +# without undue difficulty, disabling those rules for just this function is our best approach for +# now. The spacing between this comment, the cmake-lint directives, and the function docstring is +# necessary to prevent cmake-format from trying to combine the lines. + +# cmake-lint: disable=R0912,R0913,R0915 + +# This function finds arrow and sets any additional necessary environment variables. +function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON + ENABLE_PARQUET +) - if(NOT ARROW_ARMV8_ARCH) - set(ARROW_ARMV8_ARCH "armv8-a") + if(BUILD_STATIC) + if(TARGET arrow_static AND TARGET arrow_cuda_static) + list(APPEND ARROW_LIBRARIES arrow_static) + list(APPEND ARROW_LIBRARIES arrow_cuda_static) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + ${ARROW_LIBRARIES} + PARENT_SCOPE + ) + return() endif() - - if(NOT ARROW_SIMD_LEVEL) - set(ARROW_SIMD_LEVEL "NONE") + else() + if(TARGET arrow_shared AND TARGET arrow_cuda_shared) + list(APPEND ARROW_LIBRARIES arrow_shared) + list(APPEND ARROW_LIBRARIES arrow_cuda_shared) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + ${ARROW_LIBRARIES} + PARENT_SCOPE + ) + return() endif() - + endif() + + set(ARROW_BUILD_SHARED ON) + set(ARROW_BUILD_STATIC OFF) + set(CPMAddOrFindPackage CPMFindPackage) + + if(NOT ARROW_ARMV8_ARCH) + set(ARROW_ARMV8_ARCH "armv8-a") + endif() + + if(NOT ARROW_SIMD_LEVEL) + set(ARROW_SIMD_LEVEL "NONE") + endif() + + if(BUILD_STATIC) + set(ARROW_BUILD_STATIC ON) + set(ARROW_BUILD_SHARED OFF) + # Turn off CPM using `find_package` so we always download and make sure we get proper static + # library + set(CPM_DOWNLOAD_ALL TRUE) + endif() + + set(ARROW_PYTHON_OPTIONS "") + if(ENABLE_PYTHON) + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") + # Arrow's logic to build Boost from source is busted, so we have to get it from the system. 
+ list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") + list(APPEND ARROW_PYTHON_OPTIONS "Thrift_SOURCE BUNDLED") + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") + endif() + + # Set this so Arrow correctly finds the CUDA toolkit when the build machine does not have the CUDA + # driver installed. This must be an env var. + set(ENV{CUDA_LIB_PATH} "${CUDAToolkit_LIBRARY_DIR}/stubs") + + rapids_cpm_find( + Arrow ${VERSION} + GLOBAL_TARGETS arrow_shared arrow_cuda_shared + CPM_ARGS + GIT_REPOSITORY https://github.com/apache/arrow.git + GIT_TAG apache-arrow-${VERSION} + GIT_SHALLOW TRUE SOURCE_SUBDIR cpp + OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" + "CUDA_USE_STATIC_CUDA_RUNTIME ${CUDA_STATIC_RUNTIME}" + "ARROW_IPC ON" + "ARROW_CUDA ON" + "ARROW_DATASET ON" + "ARROW_WITH_BACKTRACE ON" + "ARROW_CXXFLAGS -w" + "ARROW_JEMALLOC OFF" + "ARROW_S3 ${ENABLE_S3}" + "ARROW_ORC ${ENABLE_ORC}" + # e.g. needed by blazingsql-io + "ARROW_PARQUET ${ENABLE_PARQUET}" + ${ARROW_PYTHON_OPTIONS} + # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off + "ARROW_USE_CCACHE OFF" + "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" + "ARROW_SIMD_LEVEL ${ARROW_SIMD_LEVEL}" + "ARROW_BUILD_STATIC ${ARROW_BUILD_STATIC}" + "ARROW_BUILD_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_DEPENDENCY_USE_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_BOOST_USE_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_BROTLI_USE_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_GFLAGS_USE_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_GRPC_USE_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_PROTOBUF_USE_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_ZSTD_USE_SHARED ${ARROW_BUILD_SHARED}" + ) + + set(ARROW_FOUND TRUE) + set(ARROW_LIBRARIES "") + + # Arrow_ADDED: set if CPM downloaded Arrow from Github Arrow_DIR: set if CPM found Arrow on the + # system/conda/etc. + if(Arrow_ADDED OR Arrow_DIR) if(BUILD_STATIC) - set(ARROW_BUILD_STATIC ON) - set(ARROW_BUILD_SHARED OFF) - # Turn off CPM using `find_package` so we always download - # and make sure we get proper static library - set(CPM_DOWNLOAD_ALL TRUE) - endif() - - set(ARROW_PYTHON_OPTIONS "") - if(ENABLE_PYTHON) - list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") - # Arrow's logic to build Boost from source is busted, so we have to get it from the system. - list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") - list(APPEND ARROW_PYTHON_OPTIONS "Thrift_SOURCE BUNDLED") - list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") + list(APPEND ARROW_LIBRARIES arrow_static) + list(APPEND ARROW_LIBRARIES arrow_cuda_static) + else() + list(APPEND ARROW_LIBRARIES arrow_shared) + list(APPEND ARROW_LIBRARIES arrow_cuda_shared) endif() - # Set this so Arrow correctly finds the CUDA toolkit when the build machine - # does not have the CUDA driver installed. This must be an env var. - set(ENV{CUDA_LIB_PATH} "${CUDAToolkit_LIBRARY_DIR}/stubs") - - rapids_cpm_find(Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared arrow_cuda_shared - CPM_ARGS - GIT_REPOSITORY https://github.com/apache/arrow.git - GIT_TAG apache-arrow-${VERSION} - GIT_SHALLOW TRUE - SOURCE_SUBDIR cpp - OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" - "CUDA_USE_STATIC_CUDA_RUNTIME ${CUDA_STATIC_RUNTIME}" - "ARROW_IPC ON" - "ARROW_CUDA ON" - "ARROW_DATASET ON" - "ARROW_WITH_BACKTRACE ON" - "ARROW_CXXFLAGS -w" - "ARROW_JEMALLOC OFF" - "ARROW_S3 ${ENABLE_S3}" - "ARROW_ORC ${ENABLE_ORC}" - # e.g. 
needed by blazingsql-io - "ARROW_PARQUET ${ENABLE_PARQUET}" - ${ARROW_PYTHON_OPTIONS} - # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off - "ARROW_USE_CCACHE OFF" - "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" - "ARROW_SIMD_LEVEL ${ARROW_SIMD_LEVEL}" - "ARROW_BUILD_STATIC ${ARROW_BUILD_STATIC}" - "ARROW_BUILD_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_DEPENDENCY_USE_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_BOOST_USE_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_BROTLI_USE_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_GFLAGS_USE_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_GRPC_USE_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_PROTOBUF_USE_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_ZSTD_USE_SHARED ${ARROW_BUILD_SHARED}") - - set(ARROW_FOUND TRUE) - set(ARROW_LIBRARIES "") - - # Arrow_ADDED: set if CPM downloaded Arrow from Github - # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. - if(Arrow_ADDED OR Arrow_DIR) - if(BUILD_STATIC) - list(APPEND ARROW_LIBRARIES arrow_static) - list(APPEND ARROW_LIBRARIES arrow_cuda_static) - else() - list(APPEND ARROW_LIBRARIES arrow_shared) - list(APPEND ARROW_LIBRARIES arrow_cuda_shared) - endif() - - if(Arrow_DIR) - # Set this to enable `find_package(ArrowCUDA)` - set(ArrowCUDA_DIR "${Arrow_DIR}") - find_package(Arrow REQUIRED QUIET) - find_package(ArrowCUDA REQUIRED QUIET) - elseif(Arrow_ADDED) - # Copy these files so we can avoid adding paths in - # Arrow_BINARY_DIR to target_include_directories. - # That defeats ccache. - file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util") - file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/gpu/cuda_version.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/gpu") - if(ENABLE_PARQUET) - file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet") - endif() - ### - # This shouldn't be necessary! - # - # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` - # and `arrow_shared` targets in FindArrow and FindArrowCUDA respectively, - # so for static source-builds, we have to do it after-the-fact. - # - # This only works because we know exactly which components we're using. - # Don't forget to update this list if we add more! - ### - foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) - target_include_directories(${ARROW_LIBRARY} - INTERFACE "$" - "$" - "$" - "$" - ) - endforeach() - endif() - else() - set(ARROW_FOUND FALSE) - message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.") + if(Arrow_DIR) + # Set this to enable `find_package(ArrowCUDA)` + set(ArrowCUDA_DIR "${Arrow_DIR}") + find_package(Arrow REQUIRED QUIET) + find_package(ArrowCUDA REQUIRED QUIET) + elseif(Arrow_ADDED) + # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to + # target_include_directories. That defeats ccache. + file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" + ) + file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/gpu/cuda_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/gpu" + ) + if(ENABLE_PARQUET) + file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" + ) + endif() + # + # This shouldn't be necessary! + # + # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` + # targets in FindArrow and FindArrowCUDA respectively, so for static source-builds, we have to + # do it after-the-fact. 
+ # + # This only works because we know exactly which components we're using. Don't forget to update + # this list if we add more! + # + foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) + target_include_directories( + ${ARROW_LIBRARY} + INTERFACE "$" + "$" + "$" + "$" + ) + endforeach() endif() + else() + set(ARROW_FOUND FALSE) + message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.") + endif() - if(Arrow_ADDED) - set(arrow_code_string + if(Arrow_ADDED) + set(arrow_code_string [=[ if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared)) add_library(arrow_shared ALIAS cudf::arrow_shared) @@ -166,8 +191,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB add_library(arrow_static ALIAS cudf::arrow_static) endif() ]=] - ) - set(arrow_cuda_code_string + ) + set(arrow_cuda_code_string [=[ if (TARGET cudf::arrow_cuda_shared AND (NOT TARGET arrow_cuda_shared)) add_library(arrow_cuda_shared ALIAS cudf::arrow_cuda_shared) @@ -176,49 +201,54 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB add_library(arrow_cuda_static ALIAS cudf::arrow_cuda_static) endif() ]=] - ) - - rapids_export(BUILD Arrow - VERSION ${VERSION} - EXPORT_SET arrow_targets - GLOBAL_TARGETS arrow_shared cud - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_code_string) - - rapids_export(BUILD ArrowCUDA - VERSION ${VERSION} - EXPORT_SET arrow_cuda_targets - GLOBAL_TARGETS arrow_cuda_shared arrow_cuda_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_cuda_code_string) - endif() - # We generate the arrow-config and arrowcuda-config files - # when we built arrow locally, so always do `find_dependency` - rapids_export_package(BUILD Arrow cudf-exports) - rapids_export_package(INSTALL Arrow cudf-exports) - - # We have to generate the find_dependency(ArrowCUDA) ourselves - # since we need to specify ArrowCUDA_DIR to be where Arrow - # was found, since Arrow packages ArrowCUDA.config in a non-standard - # location - rapids_export_package(BUILD ArrowCUDA cudf-exports) - - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) - rapids_export_find_package_root(BUILD ArrowCUDA [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) - - set(ARROW_FOUND "${ARROW_FOUND}" PARENT_SCOPE) - set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE) + ) + + rapids_export( + BUILD Arrow + VERSION ${VERSION} + EXPORT_SET arrow_targets + GLOBAL_TARGETS arrow_shared arrow_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_code_string + ) + + rapids_export( + BUILD ArrowCUDA + VERSION ${VERSION} + EXPORT_SET arrow_cuda_targets + GLOBAL_TARGETS arrow_cuda_shared arrow_cuda_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_cuda_code_string + ) + endif() + # We generate the arrow-config and arrowcuda-config files when we built arrow locally, so always + # do `find_dependency` + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) + + # We have to generate the find_dependency(ArrowCUDA) ourselves since we need to specify + # ArrowCUDA_DIR to be where Arrow was found, since Arrow packages ArrowCUDA.config in a + # non-standard location + rapids_export_package(BUILD ArrowCUDA cudf-exports) + + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root(BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) + rapids_export_find_package_root(BUILD ArrowCUDA [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) + + set(ARROW_FOUND + 
"${ARROW_FOUND}" + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + "${ARROW_LIBRARIES}" + PARENT_SCOPE + ) endfunction() set(CUDF_VERSION_Arrow 5.0.0) find_and_configure_arrow( - ${CUDF_VERSION_Arrow} - ${CUDF_USE_ARROW_STATIC} - ${CUDF_ENABLE_ARROW_S3} - ${CUDF_ENABLE_ARROW_ORC} - ${CUDF_ENABLE_ARROW_PYTHON} - ${CUDF_ENABLE_ARROW_PARQUET} + ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} + ${CUDF_ENABLE_ARROW_PYTHON} ${CUDF_ENABLE_ARROW_PARQUET} ) diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 47dbc037334..b6cb9757ae8 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -1,31 +1,30 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +# This function finds cucollections and sets any additional necessary environment variables. function(find_and_configure_cucollections) - # Find or install cuCollections - rapids_cpm_find(cuco 0.0 - GLOBAL_TARGETS cuco::cuco - CPM_ARGS - GITHUB_REPOSITORY NVIDIA/cuCollections - GIT_TAG 729857a5698a0e8d8f812e0464f65f37854ae17b - OPTIONS "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF" - "BUILD_EXAMPLES OFF" - ) + # Find or install cuCollections + rapids_cpm_find( + # cuCollections doesn't have a version yet + cuco 0.0 + GLOBAL_TARGETS cuco::cuco + CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections + GIT_TAG f0eecb203590f1f4ac4a9f1700229f4434ac64dc + OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" + ) + endfunction() find_and_configure_cucollections() diff --git a/cpp/cmake/thirdparty/get_dlpack.cmake b/cpp/cmake/thirdparty/get_dlpack.cmake index 1334ac91c6e..aeffd64f371 100644 --- a/cpp/cmake/thirdparty/get_dlpack.cmake +++ b/cpp/cmake/thirdparty/get_dlpack.cmake @@ -1,35 +1,37 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +# This function finds dlpack and sets any additional necessary environment variables. function(find_and_configure_dlpack VERSION) - rapids_find_generate_module( DLPACK - HEADER_NAMES dlpack.h) + rapids_find_generate_module(DLPACK HEADER_NAMES dlpack.h) - rapids_cpm_find(dlpack ${VERSION} - GIT_REPOSITORY https://github.com/dmlc/dlpack.git - GIT_TAG v${VERSION} - GIT_SHALLOW TRUE - DOWNLOAD_ONLY TRUE - OPTIONS "BUILD_MOCK OFF") + rapids_cpm_find( + dlpack ${VERSION} + GIT_REPOSITORY https://github.com/dmlc/dlpack.git + GIT_TAG v${VERSION} + GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE + OPTIONS "BUILD_MOCK OFF" + ) - if(DEFINED dlpack_SOURCE_DIR) - #otherwise find_package(DLPACK) will set this variable - set(DLPACK_INCLUDE_DIR "${dlpack_SOURCE_DIR}/include" PARENT_SCOPE) - endif() + if(DEFINED dlpack_SOURCE_DIR) + # otherwise find_package(DLPACK) will set this variable + set(DLPACK_INCLUDE_DIR + "${dlpack_SOURCE_DIR}/include" + PARENT_SCOPE + ) + endif() endfunction() set(CUDF_MIN_VERSION_dlpack 0.5) diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake index 9de2b4a50a9..1363f43fae2 100644 --- a/cpp/cmake/thirdparty/get_gtest.cmake +++ b/cpp/cmake/thirdparty/get_gtest.cmake @@ -1,36 +1,38 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +# This function finds gtest and sets any additional necessary environment variables. function(find_and_configure_gtest) - include(${rapids-cmake-dir}/cpm/gtest.cmake) + include(${rapids-cmake-dir}/cpm/gtest.cmake) - # Find or install GoogleTest - rapids_cpm_gtest(BUILD_EXPORT_SET cudf-testing-exports - INSTALL_EXPORT_SET cudf-testing-exports) + # Find or install GoogleTest + rapids_cpm_gtest(BUILD_EXPORT_SET cudf-testing-exports INSTALL_EXPORT_SET cudf-testing-exports) - if(GTest_ADDED) - rapids_export(BUILD GTest - VERSION ${GTest_VERSION} - EXPORT_SET GTestTargets - GLOBAL_TARGETS gtest gmock gtest_main gmock_main - NAMESPACE GTest::) + if(GTest_ADDED) + rapids_export( + BUILD GTest + VERSION ${GTest_VERSION} + EXPORT_SET GTestTargets + GLOBAL_TARGETS gtest gmock gtest_main gmock_main + NAMESPACE GTest:: + ) - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(BUILD GTest [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-testing-exports) - endif() + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + BUILD GTest [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-testing-exports + ) + endif() endfunction() diff --git a/cpp/cmake/thirdparty/get_jitify.cmake b/cpp/cmake/thirdparty/get_jitify.cmake index b8a85889ef2..7c4526107a3 100644 --- a/cpp/cmake/thirdparty/get_jitify.cmake +++ b/cpp/cmake/thirdparty/get_jitify.cmake @@ -1,28 +1,31 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2020-2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= # Jitify doesn't have a version :/ +# This function finds Jitify and sets any additional necessary environment variables. 
function(find_and_configure_jitify) - rapids_cpm_find(jitify 2.0.0 - GIT_REPOSITORY https://github.com/rapidsai/jitify.git - GIT_TAG cudf_0.19 - GIT_SHALLOW TRUE - DOWNLOAD_ONLY TRUE) - set(JITIFY_INCLUDE_DIR "${jitify_SOURCE_DIR}" PARENT_SCOPE) + rapids_cpm_find( + jitify 2.0.0 + GIT_REPOSITORY https://github.com/rapidsai/jitify.git + GIT_TAG cudf_0.19 + GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE + ) + set(JITIFY_INCLUDE_DIR + "${jitify_SOURCE_DIR}" + PARENT_SCOPE + ) endfunction() find_and_configure_jitify() diff --git a/cpp/cmake/thirdparty/get_libcudacxx.cmake b/cpp/cmake/thirdparty/get_libcudacxx.cmake index 4921abe0581..290c4f61e41 100644 --- a/cpp/cmake/thirdparty/get_libcudacxx.cmake +++ b/cpp/cmake/thirdparty/get_libcudacxx.cmake @@ -1,31 +1,30 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2020-2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= -function(find_and_configure_libcudacxx VERSION) - rapids_cpm_find(libcudacxx ${VERSION} - GIT_REPOSITORY https://github.com/NVIDIA/libcudacxx.git - GIT_TAG ${VERSION} - GIT_SHALLOW TRUE - DOWNLOAD_ONLY TRUE - ) +# This function finds libcudacxx and sets any additional necessary environment variables. 
+function(find_and_configure_libcudacxx) + include(${rapids-cmake-dir}/cpm/libcudacxx.cmake) - set(LIBCUDACXX_INCLUDE_DIR "${libcudacxx_SOURCE_DIR}/include" PARENT_SCOPE) - set(LIBCXX_INCLUDE_DIR "${libcudacxx_SOURCE_DIR}/libcxx/include" PARENT_SCOPE) -endfunction() + rapids_cpm_libcudacxx( + BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports PATCH_COMMAND patch + --reject-file=- -p1 -N < ${CUDF_SOURCE_DIR}/cmake/libcudacxx.patch || true + ) -set(CUDF_MIN_VERSION_libcudacxx 1.4.0) + set(LIBCUDACXX_INCLUDE_DIR + "${libcudacxx_SOURCE_DIR}/include" + PARENT_SCOPE + ) +endfunction() -find_and_configure_libcudacxx(${CUDF_MIN_VERSION_libcudacxx}) +find_and_configure_libcudacxx() diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 16d50fd3388..c1765408d62 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -1,36 +1,37 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +# This function finds nvcomp and sets any additional necessary environment variables. 
function(find_and_configure_nvcomp VERSION) - # Find or install nvcomp - rapids_cpm_find(nvcomp ${VERSION} - GLOBAL_TARGETS nvcomp::nvcomp - CPM_ARGS - GITHUB_REPOSITORY NVIDIA/nvcomp - GIT_TAG aa003db89e052e4ce408910ff17e1054b7c43b7d - OPTIONS "BUILD_STATIC ON" - "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF" - "BUILD_EXAMPLES OFF" - ) + # Find or install nvcomp + rapids_cpm_find( + nvcomp ${VERSION} + GLOBAL_TARGETS nvcomp::nvcomp + CPM_ARGS GITHUB_REPOSITORY NVIDIA/nvcomp + GIT_TAG c435afaf4ba8a8d12f379d688effcb185886cec1 + OPTIONS "BUILD_STATIC ON" "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" + ) - if(NOT TARGET nvcomp::nvcomp) - add_library(nvcomp::nvcomp ALIAS nvcomp) - endif() + if(NOT TARGET nvcomp::nvcomp) + add_library(nvcomp::nvcomp ALIAS nvcomp) + endif() + + # Per-thread default stream + if(TARGET nvcomp AND PER_THREAD_DEFAULT_STREAM) + target_compile_definitions(nvcomp PRIVATE CUDA_API_PER_THREAD_DEFAULT_STREAM) + endif() endfunction() diff --git a/cpp/cmake/thirdparty/get_rmm.cmake b/cpp/cmake/thirdparty/get_rmm.cmake index ec40afa4d05..854bd3d1149 100644 --- a/cpp/cmake/thirdparty/get_rmm.cmake +++ b/cpp/cmake/thirdparty/get_rmm.cmake @@ -1,25 +1,23 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2020-2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +# This function finds rmm and sets any additional necessary environment variables. function(find_and_configure_rmm) - include(${rapids-cmake-dir}/cpm/rmm.cmake) + include(${rapids-cmake-dir}/cpm/rmm.cmake) - # Find or install RMM - rapids_cpm_rmm(BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports) + # Find or install RMM + rapids_cpm_rmm(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) endfunction() diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 9cb8f35bda4..574bfa26a0c 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -1,74 +1,83 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +# This function finds thrust and sets any additional necessary environment variables. function(find_and_configure_thrust VERSION) - # We only want to set `UPDATE_DISCONNECTED` while - # the GIT tag hasn't moved from the last time we cloned - set(cpm_thrust_disconnect_update "UPDATE_DISCONNECTED TRUE") - set(CPM_THRUST_CURRENT_VERSION ${VERSION} CACHE STRING "version of thrust we checked out") - if(NOT VERSION VERSION_EQUAL CPM_THRUST_CURRENT_VERSION) - set(CPM_THRUST_CURRENT_VERSION ${VERSION} CACHE STRING "version of thrust we checked out" FORCE) - set(cpm_thrust_disconnect_update "") - endif() + # We only want to set `UPDATE_DISCONNECTED` while the GIT tag hasn't moved from the last time we + # cloned + set(cpm_thrust_disconnect_update "UPDATE_DISCONNECTED TRUE") + set(CPM_THRUST_CURRENT_VERSION + ${VERSION} + CACHE STRING "version of thrust we checked out" + ) + if(NOT VERSION VERSION_EQUAL CPM_THRUST_CURRENT_VERSION) + set(CPM_THRUST_CURRENT_VERSION + ${VERSION} + CACHE STRING "version of thrust we checked out" FORCE + ) + set(cpm_thrust_disconnect_update "") + endif() - # We currently require cuDF to always build with a custom - # version of thrust. This is needed so that build times of - # of cudf are kept reasonable, without this CI builds - # of cudf will be killed as some source file can take - # over 45 minutes to build - # - set(CPM_DOWNLOAD_ALL TRUE) - rapids_cpm_find( - Thrust ${VERSION} - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/NVIDIA/thrust.git - GIT_TAG ${VERSION} - GIT_SHALLOW TRUE - ${cpm_thrust_disconnect_update} - PATCH_COMMAND patch --reject-file=- -p1 -N < ${CUDF_SOURCE_DIR}/cmake/thrust.patch || true - OPTIONS "THRUST_INSTALL TRUE") + # We currently require cuDF to always build with a custom version of thrust. 
This is needed so + # that build times of of cudf are kept reasonable, without this CI builds of cudf will be killed + # as some source file can take over 45 minutes to build + # + set(CPM_DOWNLOAD_ALL TRUE) + rapids_cpm_find( + Thrust ${VERSION} + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/NVIDIA/thrust.git + GIT_TAG ${VERSION} + GIT_SHALLOW TRUE ${cpm_thrust_disconnect_update} PATCH_COMMAND patch --reject-file=- -p1 -N < + ${CUDF_SOURCE_DIR}/cmake/thrust.patch || true + OPTIONS "THRUST_INSTALL TRUE" + ) - if(NOT TARGET cudf::Thrust) - thrust_create_target(cudf::Thrust FROM_OPTIONS) - endif() + if(NOT TARGET cudf::Thrust) + thrust_create_target(cudf::Thrust FROM_OPTIONS) + endif() - if(Thrust_SOURCE_DIR) # only install thrust when we have an in-source version - include(GNUInstallDirs) - install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/" - FILES_MATCHING - PATTERN "*.h" - PATTERN "*.inl") - install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/" - FILES_MATCHING - PATTERN "*.cuh") + if(Thrust_SOURCE_DIR) # only install thrust when we have an in-source version + include(GNUInstallDirs) + install( + DIRECTORY "${Thrust_SOURCE_DIR}/thrust" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/" + FILES_MATCHING + REGEX "\\.(h|inl)$" + ) + install( + DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/" + FILES_MATCHING + PATTERN "*.cuh" + ) - install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust/cmake" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/thrust/") - install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/cub/") + install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust/cmake" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/thrust/" + ) + install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/cub/" + ) - - # Store where CMake can find our custom Thrust install - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root(INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/Thrust/]=] cudf-exports) - endif() + # Store where CMake can find our custom Thrust install + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/Thrust/]=] cudf-exports + ) + endif() endfunction() set(CUDF_MIN_VERSION_Thrust 1.12.0) diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 1da2d43cf6c..18860504bf1 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -95,7 +95,7 @@ class utility_class std::unique_ptr _column{}; } -TYPED_TEST_CASE(RepeatTypedTestFixture, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(RepeatTypedTestFixture, cudf::test::FixedWidthTypes); TYPED_TEST(RepeatTypedTestFixture, RepeatScalarCount) { diff --git a/cpp/docs/TESTING.md b/cpp/docs/TESTING.md index 3c741b5d4e7..1bdf9c208d8 100644 --- a/cpp/docs/TESTING.md +++ b/cpp/docs/TESTING.md @@ -84,7 +84,7 @@ For example: template class TypedTestFixture : cudf::test::BaseFixture {...}; using TestTypes = cudf::test:Types; // Notice custom cudf type list type 
-TYPED_TEST_CASE(TypedTestFixture, TestTypes); +TYPED_TEST_SUITE(TypedTestFixture, TestTypes); TYPED_TEST(TypedTestFixture, FirstTest){ // Access the current type using `TypeParam` using T = TypeParam; @@ -107,7 +107,7 @@ element type that libcudf supports. #include // All tests using TypeTestFixture will be invoked once for each numeric type -TYPED_TEST_CASE(TypedTestFixture, cudf::test::NumericTypes); +TYPED_TEST_SUITE(TypedTestFixture, cudf::test::NumericTypes); ``` Whenever possible, use one of the type list provided in `include/utilities/test/type_lists.hpp` @@ -131,7 +131,7 @@ template TwoTypesFixture : BaseFixture{...}; using TwoTypesList = Types< Types, Types, Types, Types >; -TYPED_TEST_CASE(TwoTypesFixture, TwoTypesList); +TYPED_TEST_SUITE(TwoTypesFixture, TwoTypesList); TYPED_TEST(TwoTypesFixture, FirstTest){ // TypeParam is a list of two types, i.e., a "nested" type list // Use `cudf::test::GetType` to retrieve the individual types diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index ad193d2f8b1..b721448b45a 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -1,6 +1,6 @@ # Regex Features -This page specifies which regex features are currently supported by libcudf strings column APIs that accept regex patterns: +This page specifies which regular expression (regex) features are currently supported by libcudf strings column APIs that accept regex patterns: - cudf::strings::contains_re() - cudf::strings::matches_re() @@ -14,6 +14,13 @@ The details are based on features documented at https://www.regular-expressions. **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen. +**Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following: +- Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals. +- Unmatched paired special characters like `()`, `[]`, and `{}`. +- Empty groups, classes, or quantifiers. That is, `()` and `[]` without an enclosing expression and `{}` without a valid integer. +- Incomplete ranges in character classes like `[-z]`, `[a-]`, and `[-]`. +- Unqualified quantifiers. That is, a quantifier with no preceding item to match like `*a`, `a⎮?`, `(+)`, `{2}a`, etc. 
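To make the undefined-behavior note above concrete, here is a minimal sketch (not part of this patch) of calling one of the listed APIs with the special character escaped so it matches literally. The helper name `find_literal_plus` is illustrative, and the `std::string`-pattern overload of `cudf::strings::contains_re` is assumed to be the one available in this version of libcudf.

```cpp
#include <cudf/column/column.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// Returns a BOOL8 column marking rows that contain the literal text "1+1".
// The `+` is escaped so it is treated as a literal character rather than a
// quantifier; an unqualified quantifier such as "(+)" would be an invalid
// pattern and, per the note above, undefined behavior.
std::unique_ptr<cudf::column> find_literal_plus(cudf::strings_column_view const& input)
{
  return cudf::strings::contains_re(input, "1\\+1");
}
```

Since invalid patterns are not reported as errors, validating patterns on the host before handing them to these APIs is a reasonable precaution.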
+ ## Features Supported ### Characters diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 4175b34ff40..df44ac31d90 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -1,20 +1,29 @@ cmake_minimum_required(VERSION 3.18) -project(basic_example VERSION 0.0.1 LANGUAGES CXX CUDA) +project( + basic_example + VERSION 0.0.1 + LANGUAGES CXX CUDA +) set(CPM_DOWNLOAD_VERSION v0.32.2) -file(DOWNLOAD https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake + ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake +) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) set(CUDF_TAG branch-21.12) -CPMFindPackage(NAME cudf - GIT_REPOSITORY https://github.com/rapidsai/cudf - GIT_TAG ${CUDF_TAG} - GIT_SHALLOW TRUE - SOURCE_SUBDIR cpp +CPMFindPackage( + NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf + GIT_TAG ${CUDF_TAG} + GIT_SHALLOW + TRUE + SOURCE_SUBDIR + cpp ) - # Configure your project here add_executable(basic_example src/process_csv.cpp) target_link_libraries(basic_example PRIVATE cudf::cudf) diff --git a/cpp/examples/basic/src/process_csv.cpp b/cpp/examples/basic/src/process_csv.cpp index cd469af0036..1d6e718717d 100644 --- a/cpp/examples/basic/src/process_csv.cpp +++ b/cpp/examples/basic/src/process_csv.cpp @@ -42,7 +42,8 @@ std::unique_ptr average_closing_price(cudf::table_view stock_info_t // Compute the average of each company's closing price with entire column cudf::groupby::groupby grpby_obj(keys); - auto requests = make_single_aggregation_request(cudf::make_mean_aggregation(), val); + auto requests = + make_single_aggregation_request(cudf::make_mean_aggregation(), val); auto agg_results = grpby_obj.aggregate(requests); diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 8ee74db7adc..374af536dc5 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -87,7 +87,9 @@ class aggregation { CUDA, ///< CUDA UDF based reduction MERGE_LISTS, ///< merge multiple lists values into one list MERGE_SETS, ///< merge multiple lists values into one list then drop duplicate entries - MERGE_M2, ///< merge partial values of M2 aggregation + MERGE_M2, ///< merge partial values of M2 aggregation, + COVARIANCE, ///< covariance between two sets of elements + CORRELATION, ///< correlation between two sets of elements TDIGEST, ///< create a tdigest from a set of input values MERGE_TDIGEST ///< create a tdigest by merging multiple tdigests together }; @@ -146,6 +148,7 @@ class groupby_scan_aggregation : public virtual aggregation { }; enum class udf_type : bool { CUDA, PTX }; +enum class correlation_type : int32_t { PEARSON, KENDALL, SPEARMAN }; /// Factory to create a SUM aggregation template @@ -495,6 +498,31 @@ std::unique_ptr make_merge_sets_aggregation(null_equality nulls_equal = nu template std::unique_ptr make_merge_m2_aggregation(); +/** + * @brief Factory to create a COVARIANCE aggregation + * + * Compute covariance between two columns. + * The input columns are child columns of a non-nullable struct columns. + * @param min_periods Minimum number of non-null observations required to produce a result. + * @param ddof Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N is + * the number of non-null observations. 
+ */ +template +std::unique_ptr make_covariance_aggregation(size_type min_periods = 1, size_type ddof = 1); + +/** + * @brief Factory to create a CORRELATION aggregation + * + * Compute correlation coefficient between two columns. + * The input columns are child columns of a non-nullable struct columns. + * + * @param type correlation_type + * @param min_periods Minimum number of non-null observations required to produce a result. + */ +template +std::unique_ptr make_correlation_aggregation(correlation_type type, + size_type min_periods = 1); + /** * @brief Factory to create a TDIGEST aggregation * diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 19df8d8e7b6..cffefcaf9cd 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -192,6 +192,15 @@ CUDA_HOST_DEVICE_CALLABLE constexpr void ast_operator_dispatcher(ast_operator op case ast_operator::NOT: f.template operator()(std::forward(args)...); break; + case ast_operator::CAST_TO_INT64: + f.template operator()(std::forward(args)...); + break; + case ast_operator::CAST_TO_UINT64: + f.template operator()(std::forward(args)...); + break; + case ast_operator::CAST_TO_FLOAT64: + f.template operator()(std::forward(args)...); + break; default: #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid operator."); @@ -780,6 +789,26 @@ struct operator_functor { } }; +template +struct cast { + static constexpr auto arity{1}; + template + CUDA_DEVICE_CALLABLE auto operator()(From f) -> decltype(static_cast(f)) + { + return static_cast(f); + } +}; + +template <> +struct operator_functor : cast { +}; +template <> +struct operator_functor : cast { +}; +template <> +struct operator_functor : cast { +}; + /* * The default specialization of nullable operators is to fall back to the non-nullable * implementation diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index 5454f9a2b95..7ae40a7d65f 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -88,29 +88,32 @@ enum class ast_operator { ///< NULL_LOGICAL_OR(null, false) is null, and NULL_LOGICAL_OR(valid, valid) == ///< LOGICAL_OR(valid, valid) // Unary operators - IDENTITY, ///< Identity function - SIN, ///< Trigonometric sine - COS, ///< Trigonometric cosine - TAN, ///< Trigonometric tangent - ARCSIN, ///< Trigonometric sine inverse - ARCCOS, ///< Trigonometric cosine inverse - ARCTAN, ///< Trigonometric tangent inverse - SINH, ///< Hyperbolic sine - COSH, ///< Hyperbolic cosine - TANH, ///< Hyperbolic tangent - ARCSINH, ///< Hyperbolic sine inverse - ARCCOSH, ///< Hyperbolic cosine inverse - ARCTANH, ///< Hyperbolic tangent inverse - EXP, ///< Exponential (base e, Euler number) - LOG, ///< Natural Logarithm (base e) - SQRT, ///< Square-root (x^0.5) - CBRT, ///< Cube-root (x^(1.0/3)) - CEIL, ///< Smallest integer value not less than arg - FLOOR, ///< largest integer value not greater than arg - ABS, ///< Absolute value - RINT, ///< Rounds the floating-point argument arg to an integer value - BIT_INVERT, ///< Bitwise Not (~) - NOT ///< Logical Not (!) 
+ IDENTITY, ///< Identity function + SIN, ///< Trigonometric sine + COS, ///< Trigonometric cosine + TAN, ///< Trigonometric tangent + ARCSIN, ///< Trigonometric sine inverse + ARCCOS, ///< Trigonometric cosine inverse + ARCTAN, ///< Trigonometric tangent inverse + SINH, ///< Hyperbolic sine + COSH, ///< Hyperbolic cosine + TANH, ///< Hyperbolic tangent + ARCSINH, ///< Hyperbolic sine inverse + ARCCOSH, ///< Hyperbolic cosine inverse + ARCTANH, ///< Hyperbolic tangent inverse + EXP, ///< Exponential (base e, Euler number) + LOG, ///< Natural Logarithm (base e) + SQRT, ///< Square-root (x^0.5) + CBRT, ///< Cube-root (x^(1.0/3)) + CEIL, ///< Smallest integer value not less than arg + FLOOR, ///< largest integer value not greater than arg + ABS, ///< Absolute value + RINT, ///< Rounds the floating-point argument arg to an integer value + BIT_INVERT, ///< Bitwise Not (~) + NOT, ///< Logical Not (!) + CAST_TO_INT64, ///< Cast value to int64_t + CAST_TO_UINT64, ///< Cast value to uint64_t + CAST_TO_FLOAT64 ///< Cast value to double }; /** diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index f99d4fa2d4e..5e89e1c7baf 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -40,6 +40,16 @@ namespace cudf { */ std::unique_ptr make_empty_column(data_type type); +/** + * @brief Creates an empty column of the specified type. + * + * An empty column contains zero elements and no validity mask. + * + * @param[in] id The column type id + * @return Empty column with specified type + */ +std::unique_ptr make_empty_column(type_id id); + /** * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the * specified numeric `data_type` with an optional null mask. diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 06768bdeb35..ba5043fb261 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -407,10 +407,15 @@ std::unique_ptr shift( * the range [0, input.size()). * * @param input View of column to slice - * @param indices A vector of indices used to take slices of `input`. + * @param indices Indices used to take slices of `input`. * @return Vector of views of `input` indicated by the ranges in `indices`. */ -std::vector slice(column_view const& input, std::vector const& indices); +std::vector slice(column_view const& input, host_span indices); +/** + * @ingroup copy_slice + * @copydoc cudf::slice(column_view const&, host_span) + */ +std::vector slice(column_view const& input, std::initializer_list indices); /** * @brief Slices a `table_view` into a set of `table_view`s according to a set of indices. @@ -441,10 +446,15 @@ std::vector slice(column_view const& input, std::vector * the range [0, input.size()). * * @param input View of table to slice - * @param indices A vector of indices used to take slices of `input`. + * @param indices Indices used to take slices of `input`. * @return Vector of views of `input` indicated by the ranges in `indices`. 
*/ -std::vector slice(table_view const& input, std::vector const& indices); +std::vector slice(table_view const& input, host_span indices); +/** + * @ingroup copy_slice + * @copydoc cudf::slice(table_view const&, host_span) + */ +std::vector slice(table_view const& input, std::initializer_list indices); /** * @brief Splits a `column_view` into a set of `column_view`s according to a set of indices @@ -475,10 +485,15 @@ std::vector slice(table_view const& input, std::vector co * @throws cudf::logic_error When the values in the `splits` are 'strictly decreasing'. * * @param input View of column to split - * @param splits A vector of indices where the view will be split + * @param splits Indices where the view will be split * @return The set of requested views of `input` indicated by the `splits`. */ -std::vector split(column_view const& input, std::vector const& splits); +std::vector split(column_view const& input, host_span splits); +/** + * @ingroup copy_split + * @copydoc cudf::split(column_view const&, host_span) + */ +std::vector split(column_view const& input, std::initializer_list splits); /** * @brief Splits a `table_view` into a set of `table_view`s according to a set of indices @@ -511,10 +526,15 @@ std::vector split(column_view const& input, std::vector * @throws cudf::logic_error When the values in the `splits` are 'strictly decreasing'. * * @param input View of a table to split - * @param splits A vector of indices where the view will be split + * @param splits Indices where the view will be split * @return The set of requested views of `input` indicated by the `splits`. */ -std::vector split(table_view const& input, std::vector const& splits); +std::vector split(table_view const& input, host_span splits); +/** + * @ingroup copy_split + * @copydoc cudf::split(table_view const&, host_span) + */ +std::vector split(table_view const& input, std::initializer_list splits); /** * @brief Column data in a serialized format diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 05d1bf3e595..69bde7f57fd 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -91,6 +91,10 @@ class simple_aggregations_collector { // Declares the interface for the simple class merge_sets_aggregation const& agg); virtual std::vector> visit(data_type col_type, class merge_m2_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class covariance_aggregation const& agg); + virtual std::vector> visit(data_type col_type, + class correlation_aggregation const& agg); virtual std::vector> visit(data_type col_type, class tdigest_aggregation const& agg); virtual std::vector> visit( @@ -129,6 +133,8 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class merge_lists_aggregation const& agg); virtual void visit(class merge_sets_aggregation const& agg); virtual void visit(class merge_m2_aggregation const& agg); + virtual void visit(class covariance_aggregation const& agg); + virtual void visit(class correlation_aggregation const& agg); virtual void visit(class tdigest_aggregation const& agg); virtual void visit(class merge_tdigest_aggregation const& agg); }; @@ -890,6 +896,77 @@ class merge_m2_aggregation final : public groupby_aggregation { void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; +/** + * @brief Derived aggregation class for specifying COVARIANCE aggregation + */ 
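// Usage sketch (illustrative, not part of this patch): with the host_span /
// std::initializer_list overloads of cudf::slice and cudf::split introduced above, split
// points can be written inline instead of materializing a std::vector first.
#include <cudf/copying.hpp>

std::vector<cudf::table_view> split_in_three(cudf::table_view const& tbl)
{
  auto const n = tbl.num_rows();
  // Produces views over [0, n/3), [n/3, 2n/3), [2n/3, n); indices must be non-decreasing.
  return cudf::split(tbl, {n / 3, 2 * n / 3});
}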
+class covariance_aggregation final : public groupby_aggregation { + public: + explicit covariance_aggregation(size_type min_periods, size_type ddof) + : aggregation{COVARIANCE}, _min_periods{min_periods}, _ddof(ddof) + { + } + size_type _min_periods; + size_type _ddof; + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + protected: + size_t hash_impl() const + { + return std::hash{}(_min_periods) ^ std::hash{}(_ddof); + } +}; + +/** + * @brief Derived aggregation class for specifying CORRELATION aggregation + */ +class correlation_aggregation final : public groupby_aggregation { + public: + explicit correlation_aggregation(correlation_type type, size_type min_periods) + : aggregation{CORRELATION}, _type{type}, _min_periods{min_periods} + { + } + correlation_type _type; + size_type _min_periods; + + bool is_equal(aggregation const& _other) const override + { + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return (_type == other._type); + } + + size_t do_hash() const override { return this->aggregation::do_hash() ^ hash_impl(); } + + std::unique_ptr clone() const override + { + return std::make_unique(*this); + } + std::vector> get_simple_aggregations( + data_type col_type, simple_aggregations_collector& collector) const override + { + return collector.visit(col_type, *this); + } + void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } + + protected: + size_t hash_impl() const + { + return std::hash{}(static_cast(_type)) ^ std::hash{}(_min_periods); + } +}; + /** * @brief Derived aggregation class for specifying TDIGEST aggregation */ @@ -1174,6 +1251,18 @@ struct target_type_impl { using type = struct_view; }; +// Always use double for COVARIANCE +template +struct target_type_impl { + using type = double; +}; + +// Always use double for CORRELATION +template +struct target_type_impl { + using type = double; +}; + // Always use numeric types for TDIGEST template struct target_type_impl(std::forward(args)...); case aggregation::MERGE_M2: return f.template operator()(std::forward(args)...); + case aggregation::COVARIANCE: + return f.template operator()(std::forward(args)...); + case aggregation::CORRELATION: + return f.template operator()(std::forward(args)...); case aggregation::TDIGEST: return f.template operator()(std::forward(args)...); case aggregation::MERGE_TDIGEST: diff --git a/cpp/include/cudf/detail/calendrical_month_sequence.cuh b/cpp/include/cudf/detail/calendrical_month_sequence.cuh new file mode 100644 index 00000000000..00742db7982 --- /dev/null +++ b/cpp/include/cudf/detail/calendrical_month_sequence.cuh @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf { +namespace detail { +struct calendrical_month_sequence_functor { + template + typename std::enable_if_t::value, std::unique_ptr> + operator()(size_type n, + scalar const& input, + size_type months, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) + { + // Return empty column if n = 0 + if (n == 0) return cudf::make_empty_column(input.type()); + + auto const device_input = + get_scalar_device_view(static_cast&>(const_cast(input))); + auto output_column_type = cudf::data_type{cudf::type_to_id()}; + auto output = cudf::make_fixed_width_column( + output_column_type, n, cudf::mask_state::UNALLOCATED, stream, mr); + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(n), + output->mutable_view().begin(), + [initial = device_input, months] __device__(size_type i) { + return datetime::detail::add_calendrical_months_with_scale_back( + initial.value(), cuda::std::chrono::months{i * months}); + }); + + return output; + } + + template + typename std::enable_if_t::value, std::unique_ptr> + operator()(Args&&...) + { + CUDF_FAIL("Cannot make a date_range of a non-datetime type"); + } +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index 9f06661c8d1..50157d16876 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -19,10 +19,13 @@ #include #include #include +#include #include #include +#include + namespace cudf { namespace detail { /** @@ -67,21 +70,71 @@ ColumnView slice(ColumnView const& input, cudf::size_type begin, cudf::size_type } /** - * @copydoc cudf::slice(column_view const&,std::vector const&) + * @copydoc cudf::slice(column_view const&, host_span) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::vector slice(column_view const& input, - std::vector const& indices, + host_span indices, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::slice(column_view const&, std::initializer_list) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::vector slice(column_view const& input, + std::initializer_list indices, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @copydoc cudf::slice(table_view const&,std::vector const&) + * @copydoc cudf::slice(table_view const&, host_span) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::vector slice(table_view const& input, + host_span indices, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::slice(table_view const&, std::initializer_list) * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ std::vector slice(table_view const& input, - std::vector const& indices, + std::initializer_list indices, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @copydoc cudf::split(column_view const&, host_span) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::vector split(column_view const& input, + host_span splits, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::split(column_view const&, std::initializer_list) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::vector split(column_view const& input, + std::initializer_list splits, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + +/** + * @copydoc cudf::split(table_view const&, host_span) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::vector split(table_view const& input, + host_span splits, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); +/** + * @copydoc cudf::split(table_view const&, std::initializer_list) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::vector split(table_view const& input, + std::initializer_list splits, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index d794adceec2..83c3b89717e 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -18,19 +18,12 @@ #include #include -#include +#include #include #include -#include -#include -#include -#include -#include #include -#include - namespace cudf { namespace detail { namespace { // anonymous @@ -40,7 +33,7 @@ template + bool has_nulls> __launch_bounds__(block_size) __global__ void copy_if_else_kernel(LeftIter lhs, RightIter rhs, @@ -71,23 +64,14 @@ __launch_bounds__(block_size) __global__ size_type warp_cur = warp_begin + warp_id; size_type index = tid; while (warp_cur <= warp_end) { - bool in_range = (index >= begin && index < end); - - bool valid = true; - if (has_validity) { - valid = in_range && (filter(index) ? thrust::get<1>(lhs[index]) : thrust::get<1>(rhs[index])); - } - - // do the copy if-else - if (in_range) { - out.element(index) = filter(index) ? static_cast(thrust::get<0>(lhs[index])) - : static_cast(thrust::get<0>(rhs[index])); - } + auto const opt_value = + (index < end) ? (filter(index) ? 
lhs[index] : rhs[index]) : thrust::nullopt; + if (opt_value) { out.element(index) = static_cast(*opt_value); } // update validity - if (has_validity) { + if (has_nulls) { // the final validity mask for this warp - int warp_mask = __ballot_sync(0xFFFF'FFFF, valid && in_range); + int warp_mask = __ballot_sync(0xFFFF'FFFF, opt_value.has_value()); // only one guy in the warp needs to update the mask and count if (lane_id == 0) { out.set_mask_word(warp_cur, warp_mask); @@ -100,7 +84,7 @@ __launch_bounds__(block_size) __global__ index += block_size * gridDim.x; } - if (has_validity) { + if (has_nulls) { // sum all null counts across all warps size_type block_valid_count = single_lane_block_sum_reduce(warp_valid_count); @@ -168,8 +152,8 @@ std::unique_ptr copy_if_else( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - using Element = - typename thrust::tuple_element<0, typename thrust::iterator_traits::value_type>::type; + // This is the type of the thrust::optional element in the passed iterators + using Element = typename thrust::iterator_traits::value_type::value_type; size_type size = std::distance(lhs_begin, lhs_end); size_type num_els = cudf::util::round_up_safe(size, warp_size); diff --git a/cpp/include/cudf/detail/datetime_ops.cuh b/cpp/include/cudf/detail/datetime_ops.cuh new file mode 100644 index 00000000000..e68785eaa3a --- /dev/null +++ b/cpp/include/cudf/detail/datetime_ops.cuh @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf { +namespace datetime { +namespace detail { +using namespace cuda::std::chrono; + +template +__device__ Timestamp add_calendrical_months_with_scale_back(Timestamp time_val, months months_val) +{ + auto const days_since_epoch = floor(time_val); + + auto const date = [&]() { + auto const ymd = year_month_day{days_since_epoch} + months_val; + return ymd.ok() ? ymd : ymd.year() / ymd.month() / last; + }(); + + auto const time = (time_val - days_since_epoch); + + return sys_days{date} + time; +} + +} // namespace detail +} // namespace datetime +} // namespace cudf diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 5d649e55389..594191d275d 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -401,7 +401,7 @@ struct column_gatherer_impl { { dictionary_column_view dictionary(source_column); auto output_count = std::distance(gather_map_begin, gather_map_end); - if (output_count == 0) return make_empty_column(data_type{type_id::DICTIONARY32}); + if (output_count == 0) return make_empty_column(type_id::DICTIONARY32); // The gather could cause some keys to be abandoned -- no indices point to them. // In this case, we could do further work to remove the abandoned keys and // reshuffle the indices values. 
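// Host-side illustration (not part of this patch) of the "scale back" rule implemented by
// add_calendrical_months_with_scale_back above: when adding months produces an invalid
// civil date, the day is clamped to the last day of the resulting month.
#include <cuda/std/chrono>

inline cuda::std::chrono::year_month_day add_months_scaled_back(
  cuda::std::chrono::year_month_day ymd, cuda::std::chrono::months m)
{
  using namespace cuda::std::chrono;
  auto const shifted = ymd + m;  // e.g. 2020-01-31 + 1 month -> 2020-02-31, which is !ok()
  return shifted.ok() ? shifted : year_month_day{shifted.year() / shifted.month() / last};
}
// 2020-01-31 + 1 month  -> 2020-02-29 (clamped; leap year)
// 2020-01-31 + 2 months -> 2020-03-31 (already valid, unchanged)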
diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp index 1e36b2b2797..8705bbd29cb 100644 --- a/cpp/include/cudf/detail/groupby/sort_helper.hpp +++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -218,7 +219,7 @@ struct sort_groupby_helper { column_ptr _keys_bitmask_column; ///< Column representing rows with one or more nulls values table_view _keys; ///< Input keys to sort by table_view _unflattened_keys; ///< Input keys, unflattened and possibly nested - std::vector _struct_null_vectors; ///< Null vectors for struct columns in _keys + structs::detail::flattened_table _flattened; ///< Support datastructures for _keys index_vector_ptr _group_offsets; ///< Indices into sorted _keys indicating starting index of each groups diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp index 7a76f9cab88..c9e1dc0c776 100644 --- a/cpp/include/cudf/detail/quantiles.hpp +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include @@ -52,13 +53,13 @@ std::unique_ptr quantiles( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::percentile_approx(column_view const&, column_view const&, + * @copydoc cudf::percentile_approx(tdigest_column_view const&, column_view const&, * rmm::mr::device_memory_resource*) * * @param stream CUDA stream used for device memory operations and kernel launches. */ std::unique_ptr percentile_approx( - column_view const& input, + tdigest::tdigest_column_view const& input, column_view const& percentiles, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp index 248af51623e..3be2798ceb7 100644 --- a/cpp/include/cudf/detail/replace.hpp +++ b/cpp/include/cudf/detail/replace.hpp @@ -96,5 +96,15 @@ std::unique_ptr find_and_replace_all( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::normalize_nans_and_zeros + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr normalize_nans_and_zeros( + column_view const& input, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index d71a8d0ec24..ec9078a4380 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -196,7 +196,7 @@ struct column_scatterer_impl { rmm::mr::device_memory_resource* mr) const { if (target_in.is_empty()) // empty begets empty - return make_empty_column(data_type{type_id::DICTIONARY32}); + return make_empty_column(type_id::DICTIONARY32); if (source_in.is_empty()) // no input, just make a copy return std::make_unique(target_in, stream, mr); diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp index 6ba46219166..99852e0af84 100644 --- a/cpp/include/cudf/detail/sequence.hpp +++ b/cpp/include/cudf/detail/sequence.hpp @@ -51,5 +51,20 @@ std::unique_ptr sequence( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::calendrical_month_sequence(size_type size, + * scalar const& init, + * size_type months, + * rmm::mr::device_memory_resource* mr) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr calendrical_month_sequence( + size_type size, + scalar const& init, + size_type months, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp index b5dfb34c043..3aa85e87b1d 100644 --- a/cpp/include/cudf/detail/sorting.hpp +++ b/cpp/include/cudf/detail/sorting.hpp @@ -76,6 +76,19 @@ std::unique_ptr segmented_sorted_order( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::stable_segmented_sorted_order + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr stable_segmented_sorted_order( + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::segmented_sort_by_key * @@ -90,6 +103,20 @@ std::unique_ptr
 segmented_sort_by_key( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::stable_segmented_sort_by_key + * + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr<table>
stable_segmented_sort_by_key( + table_view const& values, + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::sort * diff --git a/cpp/src/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp similarity index 58% rename from cpp/src/structs/utilities.hpp rename to cpp/include/cudf/detail/structs/utilities.hpp index 24b80b58669..aece79107c6 100644 --- a/cpp/src/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -16,10 +16,12 @@ #pragma once #include +#include #include #include #include +#include namespace cudf { namespace structs { @@ -64,6 +66,71 @@ std::vector> extract_ordered_struct_children( */ bool is_or_has_nested_lists(cudf::column_view const& col); +/** + * @brief Result of `flatten_nested_columns()`, where all `STRUCT` columns are replaced with + * their non-nested member columns, and `BOOL8` columns for their null masks. + * + * `flatten_nested_columns()` produces a "flattened" table_view with all `STRUCT` columns + * replaced with their child column_views, preceded by their null masks. + * All newly allocated columns and device_buffers that back the returned table_view + * are also encapsulated in `flatten_result`. + * + * Objects of `flatten_result` need to kept alive while its table_view is accessed. + */ +class flattened_table { + public: + /** + * @brief Constructor, to be used from `flatten_nested_columns()`. + * + * @param flattened_columns_ table_view resulting from `flatten_nested_columns()` + * @param orders_ Per-column ordering of the table_view + * @param null_orders_ Per-column null_order of the table_view + * @param columns_ Newly allocated columns to back the table_view + * @param null_masks_ Newly allocated null masks to back the table_view + */ + flattened_table(table_view const& flattened_columns_, + std::vector const& orders_, + std::vector const& null_orders_, + std::vector>&& columns_, + std::vector&& null_masks_) + : _flattened_columns{flattened_columns_}, + _orders{orders_}, + _null_orders{null_orders_}, + _columns{std::move(columns_)}, + _superimposed_nullmasks{std::move(null_masks_)} + { + } + + flattened_table() = default; + + /** + * @brief Getter for the flattened columns, as a `table_view`. + */ + table_view flattened_columns() const { return _flattened_columns; } + + /** + * @brief Getter for the cudf::order of the table_view's columns. + */ + std::vector orders() const { return _orders; } + + /** + * @brief Getter for the cudf::null_order of the table_view's columns. + */ + std::vector null_orders() const { return _null_orders; } + + /** + * @brief Conversion to `table_view`, to fetch flattened columns. + */ + operator table_view() const { return flattened_columns(); } + + private: + table_view _flattened_columns; + std::vector _orders; + std::vector _null_orders; + std::vector> _columns; + std::vector _superimposed_nullmasks; +}; + /** * @brief Flatten table with struct columns to table with constituent columns of struct columns. 
* @@ -74,17 +141,14 @@ bool is_or_has_nested_lists(cudf::column_view const& col); * @param null_precedence null order for input table * @param nullability force output to have nullability columns even if input columns * are all valid - * @return tuple with flattened table, flattened column order, flattened null precedence, - * vector of boolean columns (struct validity). + * @return `flatten_result` with flattened table, flattened column order, flattened null precedence, + * alongside the supporting columns and device_buffers for the flattened table. */ -std::tuple, - std::vector, - std::vector>> -flatten_nested_columns(table_view const& input, - std::vector const& column_order, - std::vector const& null_precedence, - column_nullability nullability = column_nullability::MATCH_INCOMING); +flattened_table flatten_nested_columns( + table_view const& input, + std::vector const& column_order, + std::vector const& null_precedence, + column_nullability nullability = column_nullability::MATCH_INCOMING); /** * @brief Unflatten columns flattened as by `flatten_nested_columns()`, @@ -156,6 +220,31 @@ std::tuple> superimpose_paren rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Push down nulls from a parent mask into child columns, using bitwise AND, + * for all columns in the specified table. + * + * This function constructs a table_view containing a new column_view instance equivalent to + * every column_view in the specified table. Each column_view might contain possibly new + * child column_views, all with possibly new null mask values reflecting null rows from the + * parent column: + * 1. If the column is not STRUCT, the column is returned unmodified, with no new + * supporting device_buffer instances. + * 2. If the column is STRUCT, the null masks of the parent and child are bitwise-ANDed, and a + * modified column_view is returned. This applies recursively. + * + * @param table The table_view of (possibly STRUCT) columns whose nulls need to be pushed to its + * members. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate new device memory. + * @return A pair of: + * 1. table_view of columns with nulls pushed down to child columns, as appropriate. + * 2. Supporting device_buffer instances, for any newly constructed null masks. + */ +std::tuple> superimpose_parent_nulls( + table_view const& table, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace structs } // namespace cudf diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index 94c22911c1e..852f95be96b 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -23,42 +23,32 @@ namespace detail { namespace tdigest { -// mean and weight column indices within tdigest inner struct columns -constexpr size_type mean_column_index = 0; -constexpr size_type weight_column_index = 1; - -// min and max column indices within tdigest outer struct columns -constexpr size_type centroid_column_index = 0; -constexpr size_type min_column_index = 1; -constexpr size_type max_column_index = 2; - /** - * @brief Verifies that the input column is a valid tdigest column. 
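// Usage sketch (illustrative, not part of this patch) for the flattened_table result type
// returned by cudf::structs::detail::flatten_nested_columns above. The result owns any
// columns and null masks created during flattening, so it must stay in scope for as long
// as the table_view obtained from it is used. Passing empty order vectors is assumed to be
// acceptable when ordering information is not needed.
#include <cudf/detail/structs/utilities.hpp>
#include <cudf/table/table_view.hpp>

void use_flattened(cudf::table_view const& input)
{
  auto const flattened = cudf::structs::detail::flatten_nested_columns(
    input, /*column_order=*/{}, /*null_precedence=*/{});
  cudf::table_view flat = flattened;  // implicit conversion declared above
  // ... build comparators / sort keys from `flat`; keep `flattened` alive meanwhile.
}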
- * - * struct { - * // centroids for the digest - * list { - * struct { - * double // mean - * double // weight - * }, - * ... - * } - * // these are from the input stream, not the centroids. they are used - * // during the percentile_approx computation near the beginning or - * // end of the quantiles - * double // min - * double // max - * } - * - * Each output row is a single tdigest. The length of the row is the "size" of the - * tdigest, each element of which represents a weighted centroid (mean, weight). + * @brief Create a tdigest column from it's constituent components. * - * @param col Column to be checkeed + * @param num_rows The number of rows in the output column. + * @param centroid_means The inner means column. These values are partitioned into lists by the + * `tdigest_offsets` column. + * @param centroid_weights The inner weights column. These values are partitioned into lists by the + * `tdigest_offsets` column. + * @param tdigest_offsets Offsets representing each individual tdigest in the output column. The + * offsets partition the centroid means and weights. + * @param min_values Column representing the minimum input value for each tdigest. + * @param max_values Column representing the maximum input value for each tdigest. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. * - * @throws cudf::logic error if the column is not a valid tdigest column. + * @returns The constructed tdigest column. */ -void check_is_valid_tdigest_column(column_view const& col); +std::unique_ptr make_tdigest_column( + size_type num_rows, + std::unique_ptr&& centroid_means, + std::unique_ptr&& centroid_weights, + std::unique_ptr&& tdigest_offsets, + std::unique_ptr&& min_values, + std::unique_ptr&& max_values, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create an empty tdigest column. diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 12948498455..7256cd65996 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -77,6 +77,17 @@ std::pair, std::unique_ptr> encode( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::one_hot_encode + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
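// Usage sketch (illustrative, not part of this patch) for the public counterpart of the
// one_hot_encode declaration above (cudf/transform.hpp): each category becomes one BOOL8
// column in the returned table_view, and all of those columns alias the single returned
// column, which therefore must outlive the view.
#include <cudf/column/column_view.hpp>
#include <cudf/transform.hpp>

void encode_example(cudf::column_view input, cudf::column_view categories)
{
  auto [owner, encoded] = cudf::one_hot_encode(input, categories);
  // `encoded` has categories.size() columns, each with input.size() rows; use it while
  // `owner` remains in scope.
}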
+ */ +std::pair, table_view> one_hot_encode( + column_view const& input, + column_view const& categories, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @copydoc cudf::mask_to_bools * diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index d4470048636..7524593e5ea 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -17,8 +17,8 @@ #pragma once /** - * @brief definition of the device operators - * @file device_operators.cuh + * @brief Definition of the device operators + * @file */ #include @@ -31,12 +31,37 @@ #include namespace cudf { -// ------------------------------------------------------------------------ -// Binary operators -/* @brief binary `sum` operator */ +namespace detail { + +/** + * @brief SFINAE enabled min function suitable for std::is_invocable + */ +template ()>* = nullptr> +CUDA_HOST_DEVICE_CALLABLE auto min(LHS const& lhs, RHS const& rhs) +{ + return std::min(lhs, rhs); +} + +/** + * @brief SFINAE enabled max function suitable for std::is_invocable + */ +template ()>* = nullptr> +CUDA_HOST_DEVICE_CALLABLE auto max(LHS const& lhs, RHS const& rhs) +{ + return std::max(lhs, rhs); +} +} // namespace detail + +/** + * @brief Binary `sum` operator + */ struct DeviceSum { template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs) + CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs + rhs) { return lhs + rhs; } @@ -63,7 +88,9 @@ struct DeviceSum { } }; -/* @brief `count` operator - used in rolling windows */ +/** + * @brief `count` operator - used in rolling windows + */ struct DeviceCount { template ()>* = nullptr> CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs) @@ -84,12 +111,15 @@ struct DeviceCount { } }; -/* @brief binary `min` operator */ +/** + * @brief binary `min` operator + */ struct DeviceMin { template - CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs) + CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) + -> decltype(cudf::detail::min(lhs, rhs)) { - return std::min(lhs, rhs); + return cudf::detail::min(lhs, rhs); } template < @@ -122,12 +152,15 @@ struct DeviceMin { } }; -/* @brief binary `max` operator */ +/** + * @brief binary `max` operator + */ struct DeviceMax { template - CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs) + CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) + -> decltype(cudf::detail::max(lhs, rhs)) { - return std::max(lhs, rhs); + return cudf::detail::max(lhs, rhs); } template < @@ -159,10 +192,12 @@ struct DeviceMax { } }; -/* @brief binary `product` operator */ +/** + * @brief binary `product` operator + */ struct DeviceProduct { template ()>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs) + CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs * rhs) { return lhs * rhs; } @@ -181,28 +216,34 @@ struct DeviceProduct { } }; -/* @brief binary `and` operator */ +/** + * @brief binary `and` operator + */ struct DeviceAnd { template ::value>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs) + CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs & rhs) { return (lhs & rhs); } }; -/* @brief binary `or` operator 
*/ +/** + * @brief binary `or` operator + */ struct DeviceOr { template ::value>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs) + CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs | rhs) { return (lhs | rhs); } }; -/* @brief binary `xor` operator */ +/** + * @brief binary `xor` operator + */ struct DeviceXor { template ::value>* = nullptr> - CUDA_HOST_DEVICE_CALLABLE T operator()(const T& lhs, const T& rhs) + CUDA_HOST_DEVICE_CALLABLE auto operator()(const T& lhs, const T& rhs) -> decltype(lhs ^ rhs) { return (lhs ^ rhs); } diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 65deadd6cd0..ebb21492be9 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -21,110 +21,27 @@ #include #include #include -#include using hash_value_type = uint32_t; namespace cudf { namespace detail { -namespace { -/** - * @brief Core MD5 algorithm implementation. Processes a single 512-bit chunk, - * updating the hash value so far. Does not zero out the buffer contents. - */ -void CUDA_DEVICE_CALLABLE md5_hash_step(md5_intermediate_data* hash_state) -{ - uint32_t A = hash_state->hash_value[0]; - uint32_t B = hash_state->hash_value[1]; - uint32_t C = hash_state->hash_value[2]; - uint32_t D = hash_state->hash_value[3]; - - for (unsigned int j = 0; j < 64; j++) { - uint32_t F; - uint32_t g; - switch (j / 16) { - case 0: - F = (B & C) | ((~B) & D); - g = j; - break; - case 1: - F = (D & B) | ((~D) & C); - g = (5 * j + 1) % 16; - break; - case 2: - F = B ^ C ^ D; - g = (3 * j + 5) % 16; - break; - case 3: - F = C ^ (B | (~D)); - g = (7 * j) % 16; - break; - } - - uint32_t buffer_element_as_int; - std::memcpy(&buffer_element_as_int, hash_state->buffer + g * 4, 4); - F = F + A + md5_hash_constants[j] + buffer_element_as_int; - A = D; - D = C; - C = B; - B = B + __funnelshift_l(F, F, md5_shift_constants[((j / 16) * 4) + (j % 4)]); - } - - hash_state->hash_value[0] += A; - hash_state->hash_value[1] += B; - hash_state->hash_value[2] += C; - hash_state->hash_value[3] += D; - - hash_state->buffer_length = 0; -} /** - * @brief Core MD5 element processing function + * Normalization of floating point NaNs and zeros, passthrough for all other values. 
*/ -template -void CUDA_DEVICE_CALLABLE md5_process(TKey const& key, md5_intermediate_data* hash_state) +template +T CUDA_DEVICE_CALLABLE normalize_nans_and_zeros(T const& key) { - uint32_t const len = sizeof(TKey); - uint8_t const* data = reinterpret_cast(&key); - hash_state->message_length += len; - - // 64 bytes for the number of byt es processed in a given step - constexpr int md5_chunk_size = 64; - if (hash_state->buffer_length + len < md5_chunk_size) { - std::memcpy(hash_state->buffer + hash_state->buffer_length, data, len); - hash_state->buffer_length += len; - } else { - uint32_t copylen = md5_chunk_size - hash_state->buffer_length; - - std::memcpy(hash_state->buffer + hash_state->buffer_length, data, copylen); - md5_hash_step(hash_state); - - while (len > md5_chunk_size + copylen) { - std::memcpy(hash_state->buffer, data + copylen, md5_chunk_size); - md5_hash_step(hash_state); - copylen += md5_chunk_size; + if constexpr (is_floating_point()) { + if (isnan(key)) { + return std::numeric_limits::quiet_NaN(); + } else if (key == T{0.0}) { + return T{0.0}; } - - std::memcpy(hash_state->buffer, data + copylen, len - copylen); - hash_state->buffer_length = len - copylen; - } -} - -/** - * Normalization of floating point NANs and zeros helper - */ -template ::value>* = nullptr> -T CUDA_DEVICE_CALLABLE normalize_nans_and_zeros_helper(T key) -{ - if (isnan(key)) { - return std::numeric_limits::quiet_NaN(); - } else if (key == T{0.0}) { - return T{0.0}; - } else { - return key; } + return key; } -} // namespace /** * Modified GPU implementation of @@ -149,217 +66,6 @@ void CUDA_DEVICE_CALLABLE uint32ToLowercaseHexString(uint32_t num, char* destina std::memcpy(destination, reinterpret_cast(&x), 8); } -struct MD5ListHasher { - template ()>* = nullptr> - void __device__ operator()(column_device_view data_col, - size_type offset_begin, - size_type offset_end, - md5_intermediate_data* hash_state) const - { - cudf_assert(false && "MD5 Unsupported chrono type column"); - } - - template ()>* = nullptr> - void __device__ operator()(column_device_view data_col, - size_type offset_begin, - size_type offset_end, - md5_intermediate_data* hash_state) const - { - cudf_assert(false && "MD5 Unsupported non-fixed-width type column"); - } - - template ()>* = nullptr> - void __device__ operator()(column_device_view data_col, - size_type offset_begin, - size_type offset_end, - md5_intermediate_data* hash_state) const - { - for (int i = offset_begin; i < offset_end; i++) { - if (!data_col.is_null(i)) { - md5_process(normalize_nans_and_zeros_helper(data_col.element(i)), hash_state); - } - } - } - - template < - typename T, - std::enable_if_t() && !is_floating_point() && !is_chrono()>* = nullptr> - void CUDA_DEVICE_CALLABLE operator()(column_device_view data_col, - size_type offset_begin, - size_type offset_end, - md5_intermediate_data* hash_state) const - { - for (int i = offset_begin; i < offset_end; i++) { - if (!data_col.is_null(i)) md5_process(data_col.element(i), hash_state); - } - } -}; - -template <> -void CUDA_DEVICE_CALLABLE -MD5ListHasher::operator()(column_device_view data_col, - size_type offset_begin, - size_type offset_end, - md5_intermediate_data* hash_state) const -{ - for (int i = offset_begin; i < offset_end; i++) { - if (!data_col.is_null(i)) { - string_view key = data_col.element(i); - uint32_t const len = static_cast(key.size_bytes()); - uint8_t const* data = reinterpret_cast(key.data()); - - hash_state->message_length += len; - - if (hash_state->buffer_length + len < 64) { - 
std::memcpy(hash_state->buffer + hash_state->buffer_length, data, len); - hash_state->buffer_length += len; - } else { - uint32_t copylen = 64 - hash_state->buffer_length; - std::memcpy(hash_state->buffer + hash_state->buffer_length, data, copylen); - md5_hash_step(hash_state); - - while (len > 64 + copylen) { - std::memcpy(hash_state->buffer, data + copylen, 64); - md5_hash_step(hash_state); - copylen += 64; - } - - std::memcpy(hash_state->buffer, data + copylen, len - copylen); - hash_state->buffer_length = len - copylen; - } - } - } -} - -struct MD5Hash { - MD5Hash() = default; - constexpr MD5Hash(uint32_t seed) : m_seed(seed) {} - - void __device__ finalize(md5_intermediate_data* hash_state, char* result_location) const - { - auto const full_length = (static_cast(hash_state->message_length)) << 3; - thrust::fill_n(thrust::seq, hash_state->buffer + hash_state->buffer_length, 1, 0x80); - - // 64 bytes for the number of bytes processed in a given step - constexpr int md5_chunk_size = 64; - // 8 bytes for the total message length, appended to the end of the last chunk processed - constexpr int message_length_size = 8; - // 1 byte for the end of the message flag - constexpr int end_of_message_size = 1; - if (hash_state->buffer_length + message_length_size + end_of_message_size <= md5_chunk_size) { - thrust::fill_n( - thrust::seq, - hash_state->buffer + hash_state->buffer_length + 1, - (md5_chunk_size - message_length_size - end_of_message_size - hash_state->buffer_length), - 0x00); - } else { - thrust::fill_n(thrust::seq, - hash_state->buffer + hash_state->buffer_length + 1, - (md5_chunk_size - hash_state->buffer_length), - 0x00); - md5_hash_step(hash_state); - - thrust::fill_n(thrust::seq, hash_state->buffer, md5_chunk_size - message_length_size, 0x00); - } - - std::memcpy(hash_state->buffer + md5_chunk_size - message_length_size, - reinterpret_cast(&full_length), - message_length_size); - md5_hash_step(hash_state); - -#pragma unroll - for (int i = 0; i < 4; ++i) - uint32ToLowercaseHexString(hash_state->hash_value[i], result_location + (8 * i)); - } - - template ()>* = nullptr> - void __device__ operator()(column_device_view col, - size_type row_index, - md5_intermediate_data* hash_state) const - { - cudf_assert(false && "MD5 Unsupported chrono type column"); - } - - template ()>* = nullptr> - void __device__ operator()(column_device_view col, - size_type row_index, - md5_intermediate_data* hash_state) const - { - cudf_assert(false && "MD5 Unsupported non-fixed-width type column"); - } - - template ()>* = nullptr> - void __device__ operator()(column_device_view col, - size_type row_index, - md5_intermediate_data* hash_state) const - { - md5_process(normalize_nans_and_zeros_helper(col.element(row_index)), hash_state); - } - - template < - typename T, - std::enable_if_t() && !is_floating_point() && !is_chrono()>* = nullptr> - void CUDA_DEVICE_CALLABLE operator()(column_device_view col, - size_type row_index, - md5_intermediate_data* hash_state) const - { - md5_process(col.element(row_index), hash_state); - } - - private: - uint32_t m_seed{cudf::DEFAULT_HASH_SEED}; -}; - -template <> -void CUDA_DEVICE_CALLABLE MD5Hash::operator()(column_device_view col, - size_type row_index, - md5_intermediate_data* hash_state) const -{ - string_view key = col.element(row_index); - uint32_t const len = static_cast(key.size_bytes()); - uint8_t const* data = reinterpret_cast(key.data()); - - hash_state->message_length += len; - - if (hash_state->buffer_length + len < 64) { - std::memcpy(hash_state->buffer + 
hash_state->buffer_length, data, len); - hash_state->buffer_length += len; - } else { - uint32_t copylen = 64 - hash_state->buffer_length; - std::memcpy(hash_state->buffer + hash_state->buffer_length, data, copylen); - md5_hash_step(hash_state); - - while (len > 64 + copylen) { - std::memcpy(hash_state->buffer, data + copylen, 64); - md5_hash_step(hash_state); - copylen += 64; - } - - std::memcpy(hash_state->buffer, data + copylen, len - copylen); - hash_state->buffer_length = len - copylen; - } -} - -template <> -void CUDA_DEVICE_CALLABLE MD5Hash::operator()(column_device_view col, - size_type row_index, - md5_intermediate_data* hash_state) const -{ - static constexpr size_type offsets_column_index{0}; - static constexpr size_type data_column_index{1}; - - column_device_view offsets = col.child(offsets_column_index); - column_device_view data = col.child(data_column_index); - - if (data.type().id() == type_id::LIST) cudf_assert(false && "Nested list unsupported"); - - cudf::type_dispatcher(data.type(), - MD5ListHasher{}, - data, - offsets.element(row_index), - offsets.element(row_index + 1), - hash_state); -} } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index 9a25a03c1c6..aff0d20a467 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -202,5 +202,34 @@ std::unique_ptr sequence( scalar const& init, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Generate a sequence of timestamps beginning at `init` and incrementing by `months` for + * each successive element, i.e., `output[i] = init + i * months` for `i` in `[0, size)`. + * + * If a given date is invalid, the date is scaled back to the last available day of that month. + * + * Example: + * ``` + * size = 3 + * init = 2020-01-31 08:00:00 + * months = 1 + * return = [2020-01-31 08:00:00, 2020-02-29 08:00:00, 2020-03-31 08:00:00] + * ``` + * + * @throw cudf::logic_error if input datatype is not a TIMESTAMP + * + * @param size Number of timestamps to generate + * @param init The initial timestamp + * @param months Months to increment + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns Timestamps column with sequences of months. + */ +std::unique_ptr calendrical_month_sequence( + size_type size, + scalar const& init, + size_type months, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 27eadf94fb6..42421aed716 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -21,6 +21,7 @@ #include +#include #include #include #include @@ -132,6 +133,34 @@ class data_sink { CUDF_FAIL("data_sink classes that support device_write must override it."); } + /** + * @brief Asynchronously append the buffer content to the sink from a gpu address + * + * For optimal performance, should only be called when `is_device_write_preferred` returns `true`. + * Data sink implementations that don't support direct device writes don't need to override + * this function. + * + * `gpu_data` must not be freed until this call is synchronized. + * @code{.pseudo} + * auto result = device_write_async(gpu_data, size, stream); + * result.wait(); // OR result.get() + * @endcode + * + * @throws cudf::logic_error the object does not support direct device writes, i.e. 
+ * `supports_device_write` returns `false`. + * @throws cudf::logic_error + * + * @param gpu_data Pointer to the buffer to be written into the sink object + * @param size Number of bytes to write + * @param stream CUDA stream to use + */ + virtual std::future device_write_async(void const* gpu_data, + size_t size, + rmm::cuda_stream_view stream) + { + CUDF_FAIL("data_sink classes that support device_write_async must override it."); + } + /** * @brief Flush the data written into the sink */ diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 89e589d306a..aac44bed50e 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -24,55 +24,21 @@ namespace cudf { namespace io { namespace detail { namespace csv { + /** - * @brief Class to read CSV dataset data into columns. + * @brief Reads the entire dataset. + * + * @param sources Input `datasource` object to read the dataset from + * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + * + * @return The set of columns along with table metadata */ -class reader { - private: - class impl; - std::unique_ptr _impl; - - public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Constructor from an array of datasources - * - * @param sources Input `datasource` objects to read the dataset from - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector>&& sources, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~reader(); - - /** - * @brief Reads the entire dataset. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return The set of columns along with table metadata - */ - table_with_metadata read(rmm::cuda_stream_view stream = rmm::cuda_stream_default); -}; +table_with_metadata read_csv(std::unique_ptr&& source, + csv_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); class writer { public: diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index e6d8f2de483..7ab8906e5a9 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -25,67 +25,26 @@ #include -// Forward declarations -namespace arrow { -namespace io { -class RandomAccessFile; -} -} // namespace arrow - namespace cudf { namespace io { namespace detail { namespace json { /** - * @brief Class to read JSON dataset data into columns. + * @brief Reads and returns the entire data set. 
+ * + * @param[in] sources Input `datasource` objects to read the dataset from + * @param[in] options Settings for controlling reading behavior + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @param[in] mr Device memory resource to use for device memory allocation + * + * @return cudf::table object that contains the array of cudf::column. */ -class reader { - private: - class impl; - std::unique_ptr _impl; - - public: - /** - * @brief Constructor from an array of file paths - * - * @param filepaths Paths to the files containing the input dataset - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector const& filepaths, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Constructor from an array of datasources - * - * @param sources Input `datasource` objects to read the dataset from - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit reader(std::vector>&& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Destructor explicitly-declared to avoid inlined in header - */ - ~reader(); - - /* - * @brief Reads and returns the entire data set. - * - * @param[in] options Settings for controlling reading behavior - * @return cudf::table object that contains the array of cudf::column. - */ - table_with_metadata read(json_reader_options const& options, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); -}; +table_with_metadata read_json( + std::vector>& sources, + json_reader_options const& options, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace json } // namespace detail diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 6f2a97338df..2a95b85465b 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -34,6 +34,10 @@ namespace io { * @file */ +constexpr size_t default_stripe_size_bytes = 64 * 1024 * 1024; +constexpr size_type default_stripe_size_rows = 1000000; +constexpr size_type default_row_index_stride = 10000; + /** * @brief Builds settings to use for `read_orc()`. */ @@ -386,6 +390,12 @@ class orc_writer_options { compression_type _compression = compression_type::AUTO; // Enable writing column statistics bool _enable_statistics = true; + // Maximum size of each stripe (unless smaller than a single row group) + size_t _stripe_size_bytes = default_stripe_size_bytes; + // Maximum number of rows in stripe (unless smaller than a single row group) + size_type _stripe_size_rows = default_stripe_size_rows; + // Row index stride (maximum number of rows in each row group) + size_type _row_index_stride = default_row_index_stride; // Set of columns to output table_view _table; // Optional associated metadata @@ -437,6 +447,25 @@ class orc_writer_options { */ bool enable_statistics() const { return _enable_statistics; } + /** + * @brief Returns maximum stripe size, in bytes. 
+ */ + auto stripe_size_bytes() const { return _stripe_size_bytes; } + + /** + * @brief Returns maximum stripe size, in rows. + */ + auto stripe_size_rows() const { return _stripe_size_rows; } + + /** + * @brief Returns the row index stride. + */ + auto row_index_stride() const + { + auto const unaligned_stride = std::min(_row_index_stride, stripe_size_rows()); + return unaligned_stride - unaligned_stride % 8; + } + /** * @brief Returns table to be written to output. */ @@ -463,6 +492,38 @@ class orc_writer_options { */ void enable_statistics(bool val) { _enable_statistics = val; } + /** + * @brief Sets the maximum stripe size, in bytes. + */ + void set_stripe_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size"); + _stripe_size_bytes = size_bytes; + } + + /** + * @brief Sets the maximum stripe size, in rows. + * + * If the stripe size is smaller that the row group size, row group size will be reduced to math + * the stripe size. + */ + void set_stripe_size_rows(size_type size_rows) + { + CUDF_EXPECTS(size_rows >= 512, "Maximum stripe size cannot be smaller than 512"); + _stripe_size_rows = size_rows; + } + + /** + * @brief Sets the row index stride. + * + * Rounded down to a multiple of 8. + */ + void set_row_index_stride(size_type stride) + { + CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512"); + _row_index_stride = stride; + } + /** * @brief Sets table to be written to output. * @@ -523,6 +584,42 @@ class orc_writer_options_builder { return *this; } + /** + * @brief Sets the maximum stripe size, in bytes. + * + * @param val maximum stripe size + * @return this for chaining. + */ + orc_writer_options_builder& stripe_size_bytes(size_t val) + { + options.set_stripe_size_bytes(val); + return *this; + } + + /** + * @brief Sets the maximum number of rows in output stripes. + * + * @param val maximum number or rows + * @return this for chaining. + */ + orc_writer_options_builder& stripe_size_rows(size_type val) + { + options.set_stripe_size_rows(val); + return *this; + } + + /** + * @brief Sets the row index stride. + * + * @param val new row index stride + * @return this for chaining. + */ + orc_writer_options_builder& row_index_stride(size_type val) + { + options.set_row_index_stride(val); + return *this; + } + /** * @brief Sets table to be written to output. * @@ -594,6 +691,12 @@ class chunked_orc_writer_options { compression_type _compression = compression_type::AUTO; // Enable writing column statistics bool _enable_statistics = true; + // Maximum size of each stripe (unless smaller than a single row group) + size_t _stripe_size_bytes = default_stripe_size_bytes; + // Maximum number of rows in stripe (unless smaller than a single row group) + size_type _stripe_size_rows = default_stripe_size_rows; + // Row index stride (maximum number of rows in each row group) + size_type _row_index_stride = default_row_index_stride; // Optional associated metadata const table_input_metadata* _metadata = nullptr; @@ -638,6 +741,25 @@ class chunked_orc_writer_options { */ bool enable_statistics() const { return _enable_statistics; } + /** + * @brief Returns maximum stripe size, in bytes. + */ + auto stripe_size_bytes() const { return _stripe_size_bytes; } + + /** + * @brief Returns maximum stripe size, in rows. + */ + auto stripe_size_rows() const { return _stripe_size_rows; } + + /** + * @brief Returns the row index stride. 
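// Usage sketch (illustrative, not part of this patch): setting the new stripe size and row
// index stride options through the writer options builder shown above. The output path and
// sizes are arbitrary examples; each setter enforces the minimums documented above.
#include <cudf/io/orc.hpp>

void write_orc_with_small_stripes(cudf::table_view const& tbl)
{
  auto options = cudf::io::orc_writer_options::builder(cudf::io::sink_info{"example.orc"}, tbl)
                   .stripe_size_rows(100000)             // must be >= 512
                   .stripe_size_bytes(16 * 1024 * 1024)  // must be >= 64 KB
                   .row_index_stride(10000)              // rounded down to a multiple of 8
                   .build();
  cudf::io::write_orc(options);
}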
+ */ + auto row_index_stride() const + { + auto const unaligned_stride = std::min(_row_index_stride, stripe_size_rows()); + return unaligned_stride - unaligned_stride % 8; + } + /** * @brief Returns associated metadata. */ @@ -659,6 +781,38 @@ class chunked_orc_writer_options { */ void enable_statistics(bool val) { _enable_statistics = val; } + /** + * @brief Sets the maximum stripe size, in bytes. + */ + void set_stripe_size_bytes(size_t size_bytes) + { + CUDF_EXPECTS(size_bytes >= 64 << 10, "64KB is the minimum stripe size"); + _stripe_size_bytes = size_bytes; + } + + /** + * @brief Sets the maximum stripe size, in rows. + * + * If the stripe size is smaller that the row group size, row group size will be reduced to math + * the stripe size. + */ + void set_stripe_size_rows(size_type size_rows) + { + CUDF_EXPECTS(size_rows >= 512, "maximum stripe size cannot be smaller than 512"); + _stripe_size_rows = size_rows; + } + + /** + * @brief Sets the row index stride. + * + * Rounded down to a multiple of 8. + */ + void set_row_index_stride(size_type stride) + { + CUDF_EXPECTS(stride >= 512, "Row index stride cannot be smaller than 512"); + _row_index_stride = stride; + } + /** * @brief Sets associated metadata. * @@ -709,6 +863,42 @@ class chunked_orc_writer_options_builder { return *this; } + /** + * @brief Sets the maximum stripe size, in bytes. + * + * @param val maximum stripe size + * @return this for chaining. + */ + chunked_orc_writer_options_builder& stripe_size_bytes(size_t val) + { + options.set_stripe_size_bytes(val); + return *this; + } + + /** + * @brief Sets the maximum number of rows in output stripes. + * + * @param val maximum number or rows + * @return this for chaining. + */ + chunked_orc_writer_options_builder& stripe_size_rows(size_type val) + { + options.set_stripe_size_rows(val); + return *this; + } + + /** + * @brief Sets the row index stride. + * + * @param val new row index stride + * @return this for chaining. + */ + chunked_orc_writer_options_builder& row_index_stride(size_type val) + { + options.set_row_index_stride(val); + return *this; + } + /** * @brief Sets associated metadata. * diff --git a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp index 53b31015145..8cde8c1708c 100644 --- a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp @@ -15,25 +15,45 @@ */ #pragma once -#include +#include #include namespace cudf { namespace lists { namespace detail { +/** + * @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&, + * lists_column_view const&, + * duplicate_keep_option, + * null_equality, + * nan_equality, + * rmm::mr::device_memory_resource*) + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr drop_list_duplicates( + lists_column_view const& keys, + lists_column_view const& values, + duplicate_keep_option keep_option, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @copydoc cudf::lists::drop_list_duplicates - * + * @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&, + * null_equality, + * nan_equality, + * rmm::mr::device_memory_resource*) * @param stream CUDA stream used for device memory operations and kernel launches. 
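The chunked writer options gain the same stripe controls. A sketch of how they might be used, assuming the existing cudf::io::orc_chunked_writer interface (not part of this change) and two schema-compatible table_views `part1` and `part2`:

#include <cudf/io/orc.hpp>

void write_orc_in_chunks(cudf::table_view const& part1, cudf::table_view const& part2)
{
  auto options =
    cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{"out.orc"})
      .stripe_size_rows(250000)  // same constraint as above: at least 512 rows
      .build();
  cudf::io::orc_chunked_writer writer(options);
  writer.write(part1);  // each call appends rows to the same ORC file
  writer.write(part2);
  writer.close();       // finalizes the file footer
}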
*/ std::unique_ptr drop_list_duplicates( - lists_column_view const& lists_column, + lists_column_view const& input, null_equality nulls_equal, nan_equality nans_equal, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/detail/sorting.hpp b/cpp/include/cudf/lists/detail/sorting.hpp index f68ff872020..1068a4c4b69 100644 --- a/cpp/include/cudf/lists/detail/sorting.hpp +++ b/cpp/include/cudf/lists/detail/sorting.hpp @@ -34,6 +34,19 @@ std::unique_ptr sort_lists( null_order null_precedence, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @copydoc cudf::lists::stable_sort_lists + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr stable_sort_lists( + lists_column_view const& input, + order column_order, + null_order null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace detail } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp index e778428510d..baec8dfdc9f 100644 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ b/cpp/include/cudf/lists/drop_list_duplicates.hpp @@ -28,35 +28,87 @@ namespace lists { */ /** - * @brief Create a new lists column by extracting unique entries from list elements in the given - * lists column. - * - * Given an input lists column, the list elements in the column are copied to an output lists - * column such that their duplicated entries are dropped out to keep only the unique ones. The - * order of those entries within each list are not guaranteed to be preserved as in the input. In - * the current implementation, entries in the output lists are sorted by ascending order (nulls - * last), but this is not guaranteed in future implementation. - * - * @throw cudf::logic_error if the child column of the input lists column contains nested type other - * than struct. - * - * @param lists_column The input lists column to extract lists with unique entries. - * @param nulls_equal Flag to specify whether null entries should be considered equal. - * @param nans_equal Flag to specify whether NaN entries should be considered as equal value (only - * applicable for floating point data column). + * @brief Copy the elements from the lists in `keys` and associated `values` columns according to + * the unique elements in `keys`. + * + * For each list in `keys` and associated `values`, according to the parameter `keep_option`, copy + * the unique elements from the list in `keys` and their corresponding elements in `values` to new + * lists. Order of the output elements within each list are not guaranteed to be preserved as in the + * input. + * + * Behavior is undefined if `count_elements(keys)[i] != count_elements(values)[i]` for all `i` in + * `[0, keys.size())`. + * + * @throw cudf::logic_error If the child column of the input keys column contains nested type other + * than STRUCT. + * @throw cudf::logic_error If `keys.size() != values.size()`. + * + * @param keys The input keys lists column to check for uniqueness and copy unique elements. + * @param values The values lists column in which the elements are mapped to elements in the key + * column. 
+ * @param nulls_equal Flag to specify whether null key elements should be considered as equal. + * @param nans_equal Flag to specify whether NaN key elements should be considered as equal + * (only applicable for floating point keys elements). + * @param keep_option Flag to specify which elements will be copied from the input to the output. * @param mr Device resource used to allocate memory. * * @code{.pseudo} - * input = { {1, 1, 2, 1, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } - * output = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } + * keys = { {1, 1, 2, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } + * values = { {"a", "b", "c", "d"}, {"e"}, NULL, {}, {"N0", "N1", "N2", "f", "g", "h", "i", "j"} } + * + * [out_keys, out_values] = drop_list_duplicates(keys, values, duplicate_keep_option::KEEP_FIRST) + * out_keys = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } + * out_values = { {"a", "c", "d"}, {"e"}, NULL, {}, {"f", "g", "N0"} } + * + * [out_keys, out_values] = drop_list_duplicates(keys, values, duplicate_keep_option::KEEP_LAST) + * out_keys = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } + * out_values = { {"b", "c", "d"}, {"e"}, NULL, {}, {"j", "i", "N2"} } * - * Note that permuting the entries of each list in this output also produces another valid output. + * [out_keys, out_values] = drop_list_duplicates(keys, values, duplicate_keep_option::KEEP_NONE) + * out_keys = { {2, 3}, {4}, NULL, {}, {} } + * out_values = { {"c", "d"}, {"e"}, NULL, {}, {} } + * @endcode + * + * @return A pair of lists columns storing the results from extracting unique key elements and their + * corresponding values elements from the input. + */ +std::pair, std::unique_ptr> drop_list_duplicates( + lists_column_view const& keys, + lists_column_view const& values, + duplicate_keep_option keep_option = duplicate_keep_option::KEEP_FIRST, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::UNEQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Create a new list column by copying elements from the input lists column ignoring + * duplicate list elements. + * + * Given a lists column, an output lists column is generated by copying elements from the input + * lists column in a way such that the duplicate elements in each list are ignored, producing only + * unique list elements. + * + * Order of the output elements are not guaranteed to be preserved as in the input. + * + * @throw cudf::logic_error If the child column of the input lists column contains nested type other + * than STRUCT. + * + * @param input The input lists column to check and copy unique elements. + * @param nulls_equal Flag to specify whether null key elements should be considered as equal. + * @param nans_equal Flag to specify whether NaN key elements should be considered as equal + * (only applicable for floating point keys column). + * @param keep_option Flag to specify which elements will be copied from the input to the output. + * @param mr Device resource used to allocate memory. + * + * @code{.pseudo} + * input = { {1, 1, 2, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } + * drop_list_duplicates(input) = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } * @endcode * - * @return A lists column with list elements having unique entries. + * @return A lists column storing the results from extracting unique list elements from the input. 
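A brief sketch of the keys/values overload described above, illustrative only; the test wrappers and literal data are made up and not part of this change:

#include <cudf/lists/drop_list_duplicates.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/stream_compaction.hpp>
#include <cudf_test/column_wrapper.hpp>

void drop_duplicates_pair_example()
{
  cudf::test::lists_column_wrapper<int32_t> keys{{1, 1, 2, 3}, {4}};
  cudf::test::lists_column_wrapper<cudf::string_view> values{{"a", "b", "c", "d"}, {"e"}};

  auto [unique_keys, kept_values] =
    cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys},
                                      cudf::lists_column_view{values},
                                      cudf::duplicate_keep_option::KEEP_FIRST);
  // unique_keys ~ { {1, 2, 3}, {4} }  (order within each list is unspecified)
  // kept_values ~ { {"a", "c", "d"}, {"e"} }
}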
*/ std::unique_ptr drop_list_duplicates( - lists_column_view const& lists_column, + lists_column_view const& input, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::UNEQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/lists/extract.hpp b/cpp/include/cudf/lists/extract.hpp index 54db1ab8f7f..0042d0e00e1 100644 --- a/cpp/include/cudf/lists/extract.hpp +++ b/cpp/include/cudf/lists/extract.hpp @@ -28,8 +28,8 @@ namespace lists { */ /** - * @brief Create a column using values from row `index` from each - * sublist within the input `lists_column`. + * @brief Create a column where each row is the element at position `index` from the corresponding + * sublist in the input `lists_column`. * * Output `column[i]` is set from element `lists_column[i][index]`. * If `index` is larger than the size of the sublist at `lists_column[i]` @@ -65,6 +65,45 @@ std::unique_ptr extract_list_element( size_type index, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a column where each row is a single element from the corresponding sublist + * in the input `lists_column`, selected using indices from the `indices` column. + * + * Output `column[i]` is set from element `lists_column[i][indices[i]]`. + * If `indices[i]` is larger than the size of the sublist at `lists_column[i]` + * then output `column[i] = null`. + * Similarly, if `indices[i]` is `null`, then `column[i] = null`. + * + * @code{.pseudo} + * l = { {1, 2, 3}, {4}, {5, 6} } + * r = extract_list_element(l, {0, null, 2}) + * r is now {1, null, null} + * @endcode + * + * `indices[i]` may also be negative, in which case the row retrieved is offset + * from the end of each sublist. + * + * @code{.pseudo} + * l = { {"a"}, {"b", "c"}, {"d", "e", "f"} } + * r = extract_list_element(l, {-1, -2, -4}) + * r is now {"a", "b", null} + * @endcode + * + * Any input where `lists_column[i] == null` produces output `column[i] = null`. + * Any input where `lists_column[i][indices[i]] == null` produces output `column[i] = null`. + * + * @param lists_column Column to extract elements from. + * @param indices The column whose rows indicate the element index to be retrieved from each list + * row. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return Column of extracted elements. + * @throws cudf::logic_error If the sizes of `lists_column` and `indices` do not match. + */ +std::unique_ptr extract_list_element( + lists_column_view const& lists_column, + column_view const& indices, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/lists/sorting.hpp b/cpp/include/cudf/lists/sorting.hpp index e27f3d03d86..55fd722ca14 100644 --- a/cpp/include/cudf/lists/sorting.hpp +++ b/cpp/include/cudf/lists/sorting.hpp @@ -54,6 +54,18 @@ std::unique_ptr sort_lists( null_order null_precedence, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Segmented sort of the elements within a list in each row of a list column using stable + * sort. 
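A sketch of the new indices-column overload of extract_list_element documented above; illustrative only, with made-up wrappers and data:

#include <cudf/lists/extract.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void extract_per_row_example()
{
  cudf::test::lists_column_wrapper<int32_t> lists{{1, 2, 3}, {4}, {5, 6}};
  // One index per list row; negative values count back from the end of each sublist.
  cudf::test::fixed_width_column_wrapper<int32_t> indices{0, 0, -1};

  auto result = cudf::lists::extract_list_element(cudf::lists_column_view{lists}, indices);
  // result ~ {1, 4, 6}
}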
+ * + * @copydoc cudf::lists::sort_lists + */ +std::unique_ptr stable_sort_lists( + lists_column_view const& source_column, + order column_order, + null_order null_precedence, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace lists } // namespace cudf diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp index 092a44a9b04..6aa72de8bc7 100644 --- a/cpp/include/cudf/quantiles.hpp +++ b/cpp/include/cudf/quantiles.hpp @@ -17,8 +17,8 @@ #pragma once #include -#include #include +#include #include namespace cudf { @@ -121,7 +121,7 @@ std::unique_ptr
quantiles( * @returns LIST Column containing requested percentile values as FLOAT64. */ std::unique_ptr percentile_approx( - structs_column_view const& input, + tdigest::tdigest_column_view const& input, column_view const& percentiles, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index 2c9c9c64a64..c17abe8267d 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -211,6 +211,18 @@ std::unique_ptr segmented_sorted_order( std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns sorted order after stably sorting each segment in the table. + * + * @copydoc cudf::segmented_sorted_order + */ +std::unique_ptr stable_segmented_sorted_order( + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Performs a lexicographic segmented sort of a table * @@ -241,5 +253,18 @@ std::unique_ptr
segmented_sort_by_key( std::vector const& null_precedence = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Performs a stable lexicographic segmented sort of a table + * + * @copydoc cudf::segmented_sort_by_key + */ +std::unique_ptr
stable_segmented_sort_by_key( + table_view const& values, + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order = {}, + std::vector const& null_precedence = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp index 192be4fb6a9..7551511d281 100644 --- a/cpp/include/cudf/stream_compaction.hpp +++ b/cpp/include/cudf/stream_compaction.hpp @@ -208,9 +208,9 @@ std::unique_ptr
apply_boolean_mask( * @brief Choices for drop_duplicates API for retainment of duplicate rows */ enum class duplicate_keep_option { - KEEP_FIRST = 0, ///< Keeps first duplicate row and unique rows - KEEP_LAST, ///< Keeps last duplicate row and unique rows - KEEP_NONE ///< Keeps only unique rows are kept + KEEP_FIRST = 0, ///< Keeps first duplicate element and unique elements + KEEP_LAST, ///< Keeps last duplicate element and unique elements + KEEP_NONE ///< Keeps only unique elements }; /** diff --git a/cpp/include/cudf/strings/capitalize.hpp b/cpp/include/cudf/strings/capitalize.hpp index 604756b5d09..dbf8ef54e3e 100644 --- a/cpp/include/cudf/strings/capitalize.hpp +++ b/cpp/include/cudf/strings/capitalize.hpp @@ -91,6 +91,33 @@ std::unique_ptr title( string_character_types sequence_type = string_character_types::ALPHA, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Checks if the strings in the input column are title formatted. + * + * The first character of each word should be upper-case while all other + * characters should be lower-case. A word is a sequence of upper-case + * and lower-case characters. + * + * This function returns a column of booleans indicating true if the string in + * the input row is in title format and false if not. + * + * @code{.pseudo} + * Example: + * input = [" Test1", "A Test", " Another test ", "N2Vidia Corp", "!Abc"]; + * output = is_title(input) + * output is [true, true, false, true, true] + * @endcode + * + * Any null string entries result in corresponding null output column entries. + * + * @param input String column. + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Column of type BOOL8. + */ +std::unique_ptr is_title( + strings_column_view const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index a650fdc239a..9f408a40314 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include namespace cudf { @@ -44,12 +45,14 @@ namespace strings { * * @param strings Strings instance for this operation. * @param pattern Regex pattern to match to each string. + * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of boolean results for each string. */ std::unique_ptr contains_re( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -69,12 +72,14 @@ std::unique_ptr contains_re( * * @param strings Strings instance for this operation. * @param pattern Regex pattern to match to each string. + * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column of boolean results for each string. 
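As a usage sketch for the new `flags` parameter, illustrative only and assuming a strings_column_view named `input`:

#include <cudf/column/column.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/regex/flags.hpp>
#include <cudf/strings/strings_column_view.hpp>

std::unique_ptr<cudf::column> lines_starting_with_cudf(cudf::strings_column_view const& input)
{
  using cudf::strings::regex_flags;
  // MULTILINE makes '^'/'$' match at embedded new-lines; DOTALL lets '.' span them.
  // The flags are plain enum values, so combining them requires a cast back to regex_flags.
  auto const flags = static_cast<regex_flags>(regex_flags::MULTILINE | regex_flags::DOTALL);
  return cudf::strings::contains_re(input, "^cudf.*$", flags);
}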
*/ std::unique_ptr matches_re( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -94,12 +99,14 @@ std::unique_ptr matches_re( * * @param strings Strings instance for this operation. * @param pattern Regex pattern to match within each string. + * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New INT32 column with counts for each string. */ std::unique_ptr count_re( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp new file mode 100644 index 00000000000..ec22186ea99 --- /dev/null +++ b/cpp/include/cudf/strings/convert/convert_lists.hpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +namespace cudf { +namespace strings { +/** + * @addtogroup strings_convert + * @{ + * @file + */ + +/** + * @brief Convert a list column of strings into a formatted strings column. + * + * The `separators` column should contain 3 strings elements in the following order: + * - element separator (default is comma `,`) + * - left-hand enclosure (default is `[`) + * - right-hand enclosure (default is `]`) + * + * @code{.pseudo} + * l1 = { [[a,b,c], [d,e]], [[f,g], [h]] } + * s1 = format_list_column(l1) + * s1 is now ["[[a,b,c],[d,e]]", "[[f,g],[h]]"] + * + * l2 = { [[a,b,c], [d,e]], [NULL], [[f,g], NULL, [h]] } + * s2 = format_list_column(l1, '-', [':', '{', '}']) + * s2 is now ["{{a:b:c}:{d:e}}", "{-}", "{{f:g}:-:{h}}"] + * @endcode + * + * @throw cudf::logic_error if the input column is not a LIST type with a STRING child. + * + * @param input Lists column to format. + * @param na_rep Replacment string for null elements. + * @param separator Strings to use for enclosing list components and separating elements. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New strings column. 
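A sketch of calling the new formatting API with custom separators; illustrative only, the wrapper types and data are not part of this change:

#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/convert/convert_lists.hpp>
#include <cudf_test/column_wrapper.hpp>

std::unique_ptr<cudf::column> format_lists_example()
{
  cudf::test::lists_column_wrapper<cudf::string_view> lists{{"a", "b", "c"}, {"d", "e"}};
  cudf::test::strings_column_wrapper separators({":", "{", "}"});  // element, left, right

  return cudf::strings::format_list_column(cudf::lists_column_view{lists},
                                           cudf::string_scalar("-"),  // stands in for nulls
                                           cudf::strings_column_view{separators});
  // result ~ ["{a:b:c}", "{d:e}"]
}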
+ */ +std::unique_ptr format_list_column( + lists_column_view const& input, + string_scalar const& na_rep = string_scalar("NULL"), + strings_column_view const& separators = strings_column_view(column_view{ + data_type{type_id::STRING}, 0, nullptr}), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index b6d34f8d89a..f2fc1889c4e 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -33,42 +33,41 @@ namespace detail { * strings from the lhs iterator or the rhs iterator. * * ``` - * output[i] = filter_fn(i) ? lhs(i).first : rhs(i).first + * output[i] = filter_fn(i) ? lhs(i) : rhs(i) * ``` * - * @tparam StringPairIterLeft Pair iterator returning thrust::pair where the - * bool parameter specifies if the string_view is valid (true) or not (false). - * @tparam StringPairIterRight Pair iterator returning thrust::pair where the - * bool parameter specifies if the string_view is valid (true) or not (false). + * @tparam StringIterLeft A random access iterator whose value_type is + * `thrust::optional` where the `optional` has a value iff the element is valid. + * @tparam StringIterRight A random access iterator whose value_type is + * `thrust::optional` where the `optional` has a value iff the element is valid. * @tparam Filter Functor that takes an index and returns a boolean. * - * @param lhs_begin Start of first set of data. Used when filter_fn returns true. + * @param lhs_begin Start of first set of data. Used when `filter_fn` returns true. * @param lhs_end End of first set of data. - * @param rhs_begin Strings of second set of data. Used when filter_fn returns false. - * @param filter_fn Called to determine which iterator (lhs or rhs) to retrieve an entry for a - * specific row. + * @param rhs_begin Strings of second set of data. Used when `filter_fn` returns false. + * @param filter_fn Called to determine which iterator to use for a specific row. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ -template +template std::unique_ptr copy_if_else( - StringPairIterLeft lhs_begin, - StringPairIterLeft lhs_end, - StringPairIterRight rhs_begin, + StringIterLeft lhs_begin, + StringIterLeft lhs_end, + StringIterRight rhs_begin, Filter filter_fn, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = std::distance(lhs_begin, lhs_end); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); // create null mask auto valid_mask = cudf::detail::valid_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), [lhs_begin, rhs_begin, filter_fn] __device__(size_type idx) { - return filter_fn(idx) ? thrust::get<1>(lhs_begin[idx]) : thrust::get<1>(rhs_begin[idx]); + return filter_fn(idx) ? 
lhs_begin[idx].has_value() : rhs_begin[idx].has_value(); }, stream, mr); @@ -77,13 +76,10 @@ std::unique_ptr copy_if_else( // build offsets column auto offsets_transformer = [lhs_begin, rhs_begin, filter_fn] __device__(size_type idx) { - bool bfilter = filter_fn(idx); - size_type bytes = 0; - if (bfilter ? thrust::get<1>(lhs_begin[idx]) : thrust::get<1>(rhs_begin[idx])) - bytes = bfilter ? thrust::get<0>(lhs_begin[idx]).size_bytes() - : thrust::get<0>(rhs_begin[idx]).size_bytes(); - return bytes; + auto const result = filter_fn(idx) ? lhs_begin[idx] : rhs_begin[idx]; + return result.has_value() ? result->size_bytes() : 0; }; + auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer); auto offsets_column = make_offsets_child_column( @@ -101,9 +97,9 @@ std::unique_ptr copy_if_else( thrust::make_counting_iterator(0), strings_count, [lhs_begin, rhs_begin, filter_fn, d_offsets, d_chars] __device__(size_type idx) { - auto bfilter = filter_fn(idx); - if (bfilter ? !thrust::get<1>(lhs_begin[idx]) : !thrust::get<1>(rhs_begin[idx])) return; - string_view d_str = bfilter ? thrust::get<0>(lhs_begin[idx]) : thrust::get<0>(rhs_begin[idx]); + auto const result = filter_fn(idx) ? lhs_begin[idx] : rhs_begin[idx]; + if (!result.has_value()) return; + auto const d_str = *result; memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); }); diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 16955b3251b..ec4a88a0e46 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -227,7 +227,7 @@ std::unique_ptr gather_chars(StringIterator strings_begin, rmm::mr::device_memory_resource* mr) { auto const output_count = std::distance(map_begin, map_end); - if (output_count == 0) return make_empty_column(data_type{type_id::INT8}); + if (output_count == 0) return make_empty_column(type_id::INT8); auto chars_column = create_chars_child_column(chars_bytes, stream, mr); auto const d_chars = chars_column->mutable_view().template data(); @@ -292,14 +292,13 @@ std::unique_ptr gather( { auto const output_count = std::distance(begin, end); auto const strings_count = strings.size(); - if (output_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (output_count == 0) return make_empty_column(type_id::STRING); // allocate offsets column and use memory to compute string size in each output row auto out_offsets_column = make_numeric_column( data_type{type_id::INT32}, output_count + 1, mask_state::UNALLOCATED, stream, mr); auto const d_out_offsets = out_offsets_column->mutable_view().template data(); - auto const d_in_offsets = - (strings_count > 0) ? strings.offsets().data() + strings.offset() : nullptr; + auto const d_in_offsets = (strings_count > 0) ? 
strings.offsets_begin() : nullptr; thrust::transform(rmm::exec_policy(stream), begin, end, diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index 4657f6c83bd..a132d8c7229 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -53,7 +53,7 @@ std::unique_ptr merge(strings_column_view const& lhs, { using cudf::detail::side; size_type strings_count = static_cast(std::distance(begin, end)); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); auto lhs_column = column_device_view::create(lhs.parent(), stream); auto d_lhs = *lhs_column; diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index 0a53c930bb3..d1b16a5fe03 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -61,7 +61,7 @@ std::unique_ptr scatter( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - if (target.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (target.is_empty()) return make_empty_column(type_id::STRING); // create vector of string_view's to scatter into rmm::device_uvector target_vector = create_string_vector_from_column(target, stream); diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index bb5e2787a14..b35f5df2903 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -62,7 +62,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, { CUDF_FUNC_RANGE(); size_type strings_count = thrust::distance(begin, end); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); using string_index_pair = thrust::pair; @@ -163,7 +163,7 @@ std::unique_ptr make_strings_column(CharIterator chars_begin, CUDF_FUNC_RANGE(); size_type strings_count = thrust::distance(offsets_begin, offsets_end) - 1; size_type bytes = std::distance(chars_begin, chars_end) * sizeof(char); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); CUDF_EXPECTS(null_count < strings_count, "null strings column not yet supported"); CUDF_EXPECTS(bytes >= 0, "invalid offsets data"); diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp new file mode 100644 index 00000000000..f6aee6d22cc --- /dev/null +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include + +namespace cudf { +namespace strings { + +/** + * @addtogroup strings_contains + * @{ + */ + +/** + * @brief Regex flags. + * + * These types can be or'd to combine them. + * The values are chosen to leave room for future flags + * and to match the Python flag values. + */ +enum regex_flags : uint32_t { + DEFAULT = 0, /// default + MULTILINE = 8, /// the '^' and '$' honor new-line characters + DOTALL = 16 /// the '.' matching includes new-line characters +}; + +/** + * @brief Returns true if the given flags contain MULTILINE. + * + * @param f Regex flags to check + * @return true if `f` includes MULTILINE + */ +constexpr bool is_multiline(regex_flags const f) +{ + return (f & regex_flags::MULTILINE) == regex_flags::MULTILINE; +} + +/** + * @brief Returns true if the given flags contain DOTALL. + * + * @param f Regex flags to check + * @return true if `f` includes DOTALL + */ +constexpr bool is_dotall(regex_flags const f) +{ + return (f & regex_flags::DOTALL) == regex_flags::DOTALL; +} + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index c09eedee280..fb3b9387a9b 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -52,6 +52,9 @@ class strings_column_view : private column_view { using column_view::offset; using column_view::size; + using offset_iterator = offset_type const*; + using chars_iterator = char const*; + /** * @brief Returns the parent column. */ @@ -64,6 +67,24 @@ class strings_column_view : private column_view { */ column_view offsets() const; + /** + * @brief Return an iterator for the offsets child column. + * + * This automatically applies the offset of the parent. + * + * @return Iterator pointing to the first offset value. + */ + offset_iterator offsets_begin() const; + + /** + * @brief Return an end iterator for the offsets child column. + * + * This automatically applies the offset of the parent. + * + * @return Iterator pointing 1 past the last offset value. + */ + offset_iterator offsets_end() const; + /** * @brief Returns the internal column of chars * @@ -78,6 +99,29 @@ class strings_column_view : private column_view { * view (i.e.: non-zero offset or reduced row count). */ size_type chars_size() const noexcept; + + /** + * @brief Return an iterator for the chars child column. + * + * This does not apply the offset of the parent. + * The offsets child must be used to properly address the char bytes. + * + * For example, to access the first character of string `i` (accounting for + * a sliced column offset) use: `chars_begin()[offsets_begin()[i]]`. + * + * @return Iterator pointing to the first char byte. + */ + chars_iterator chars_begin() const; + + /** + * @brief Return an end iterator for the offsets child column. + * + * This does not apply the offset of the parent. + * The offsets child must be used to properly address the char bytes. + * + * @return Iterator pointing 1 past the last char byte. + */ + chars_iterator chars_end() const; }; //! Strings column APIs. diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.cuh b/cpp/include/cudf/tdigest/tdigest_column_view.cuh new file mode 100644 index 00000000000..c7513452387 --- /dev/null +++ b/cpp/include/cudf/tdigest/tdigest_column_view.cuh @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +namespace cudf { +namespace tdigest { + +struct tdigest_size { + size_type const* offsets; + __device__ size_type operator()(size_type tdigest_index) + { + return offsets[tdigest_index + 1] - offsets[tdigest_index]; + } +}; + +/** + * @brief Given a column_view containing tdigest data, an instance of this class + * provides a wrapper on the compound column for tdigest operations. + * + * A tdigest is a "compressed" set of input scalars represented as a sorted + * set of centroids (https://arxiv.org/pdf/1902.04023.pdf). + * This data can be queried for quantile information. Each row in a tdigest + * column represents an entire tdigest. + * + * The column has the following structure: + * + * struct { + * // centroids for the digest + * list { + * struct { + * double // mean + * double // weight + * } + * } + * // these are from the input stream, not the centroids. they are used + * // during the percentile_approx computation near the beginning or + * // end of the quantiles + * double // min + * double // max + * } + */ +class tdigest_column_view : private column_view { + public: + tdigest_column_view(column_view const& col); + tdigest_column_view(tdigest_column_view&& tdigest_view) = default; + tdigest_column_view(const tdigest_column_view& tdigest_view) = default; + ~tdigest_column_view() = default; + tdigest_column_view& operator=(tdigest_column_view const&) = default; + tdigest_column_view& operator=(tdigest_column_view&&) = default; + + using column_view::size; + static_assert(std::is_same_v, + "offset_type is expected to be the same as size_type."); + using offset_iterator = offset_type const*; + + // mean and weight column indices within tdigest inner struct columns + static constexpr size_type mean_column_index{0}; + static constexpr size_type weight_column_index{1}; + + // min and max column indices within tdigest outer struct columns + static constexpr size_type centroid_column_index{0}; + static constexpr size_type min_column_index{1}; + static constexpr size_type max_column_index{2}; + + /** + * @brief Returns the parent column. + */ + column_view parent() const; + + /** + * @brief Returns the column of centroids + */ + lists_column_view centroids() const; + + /** + * @brief Returns the internal column of mean values + */ + column_view means() const; + + /** + * @brief Returns the internal column of weight values + */ + column_view weights() const; + + /** + * @brief Returns an iterator that returns the size of each tdigest + * in the column (each row is 1 digest) + */ + auto size_begin() const + { + return cudf::detail::make_counting_transform_iterator( + 0, tdigest_size{centroids().offsets_begin()}); + } + + /** + * @brief Returns the first min value for the column. Each row corresponds + * to the minimum value for the accompanying digest. + */ + double const* min_begin() const; + + /** + * @brief Returns the first max value for the column. 
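To show how this view is consumed, a small sketch against the updated percentile_approx signature. It is illustrative only: it assumes `tdigests` was produced elsewhere by a tdigest aggregation, and since the view header is a .cuh it would live in a .cu translation unit.

#include <cudf/quantiles.hpp>
#include <cudf/tdigest/tdigest_column_view.cuh>
#include <cudf_test/column_wrapper.hpp>

std::unique_ptr<cudf::column> query_percentiles(cudf::column_view const& tdigests)
{
  // Wrapping validates that the column has the structure described above.
  cudf::tdigest::tdigest_column_view const tdv{tdigests};
  cudf::test::fixed_width_column_wrapper<double> percentiles{0.5, 0.99};
  // Result is a LIST column of FLOAT64: one list of percentile values per digest.
  return cudf::percentile_approx(tdv, percentiles);
}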
Each row corresponds + * to the maximum value for the accompanying digest. + */ + double const* max_begin() const; +}; + +} // namespace tdigest +} // namespace cudf diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp index af2858d948e..55e7bc84dbe 100644 --- a/cpp/include/cudf/transform.hpp +++ b/cpp/include/cudf/transform.hpp @@ -140,6 +140,38 @@ std::pair, std::unique_ptr> encode( cudf::table_view const& input, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Encodes `input` by generating a new column for each value in `categories` indicating the + * presence of that value in `input`. + * + * The resulting per-category columns are returned concatenated as a single column viewed by a + * `table_view`. + * + * The `i`th row of the `j`th column in the output table equals 1 + * if `input[i] == categories[j]`, and 0 otherwise. + * + * The `i`th row of the `j`th column in the output table equals 1 + * if input[i] == categories[j], and 0 otherwise. + * + * Examples: + * @code{.pseudo} + * input: [{'a', 'c', null, 'c', 'b'}] + * categories: ['c', null] + * output: [{0, 1, 0, 1, 0}, {0, 0, 1, 0, 0}] + * @endcode + * + * @throws cudf::logic_error if input and categories are of different types. + * + * @param input Column containing values to be encoded + * @param categories Column containing categories + * @param mr Device memory resource used to allocate the returned table's device memory + * @return A pair containing the owner to all encoded data and a table view into the data + */ +std::pair, table_view> one_hot_encode( + column_view const& input, + column_view const& categories, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Creates a boolean column from given bitmask. * diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp index 6f69dc19cfd..5fa07fd5568 100644 --- a/cpp/include/cudf_test/base_fixture.hpp +++ b/cpp/include/cudf_test/base_fixture.hpp @@ -24,7 +24,9 @@ #include #include +#include #include +#include #include #include #include @@ -217,6 +219,8 @@ class TempDirTestEnvironment : public ::testing::Environment { /// MR factory functions inline auto make_cuda() { return std::make_shared(); } +inline auto make_async() { return std::make_shared(); } + inline auto make_managed() { return std::make_shared(); } inline auto make_pool() @@ -224,6 +228,11 @@ inline auto make_pool() return rmm::mr::make_owning_wrapper(make_cuda()); } +inline auto make_arena() +{ + return rmm::mr::make_owning_wrapper(make_cuda()); +} + inline auto make_binning() { auto pool = make_pool(); @@ -253,7 +262,9 @@ inline std::shared_ptr create_memory_resource( { if (allocation_mode == "binning") return make_binning(); if (allocation_mode == "cuda") return make_cuda(); + if (allocation_mode == "async") return make_async(); if (allocation_mode == "pool") return make_pool(); + if (allocation_mode == "arena") return make_arena(); if (allocation_mode == "managed") return make_managed(); CUDF_FAIL("Invalid RMM allocation mode: " + allocation_mode); } @@ -266,6 +277,9 @@ inline std::shared_ptr create_memory_resource( * * Currently only supports 'rmm_mode' string parameter, which set the rmm * allocation mode. The default value of the parameter is 'pool'. + * Environment variable 'CUDF_TEST_RMM_MODE' can also be used to set the rmm + * allocation mode. If both are set, the value of 'rmm_mode' string parameter + * takes precedence. 
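A small sketch of the one_hot_encode API added above; illustrative only, with made-up test wrappers and values:

#include <cudf/transform.hpp>
#include <cudf_test/column_wrapper.hpp>

void one_hot_example()
{
  cudf::test::fixed_width_column_wrapper<int32_t> input{1, 2, 1, 3};
  cudf::test::fixed_width_column_wrapper<int32_t> categories{1, 2};

  // `owner` holds all encoded data; `encoded` views one column per category.
  auto [owner, encoded] = cudf::one_hot_encode(input, categories);
  // encoded.column(0) ~ {1, 0, 1, 0}  (input == 1)
  // encoded.column(1) ~ {0, 1, 0, 0}  (input == 2)
}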
* * @return Parsing results in the form of unordered map */ @@ -273,9 +287,12 @@ inline auto parse_cudf_test_opts(int argc, char** argv) { try { cxxopts::Options options(argv[0], " - cuDF tests command line options"); + const char* env_rmm_mode = std::getenv("GTEST_CUDF_RMM_MODE"); // Overridden by CLI options + auto default_rmm_mode = env_rmm_mode ? env_rmm_mode : "pool"; options.allow_unrecognised_options().add_options()( - "rmm_mode", "RMM allocation mode", cxxopts::value()->default_value("pool")); - + "rmm_mode", + "RMM allocation mode", + cxxopts::value()->default_value(default_rmm_mode)); return options.parse(argc, argv); } catch (const cxxopts::OptionException& e) { CUDF_FAIL("Error parsing command line options"); diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index e2d38b795cc..4bc48769592 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -868,7 +868,7 @@ class dictionary_column_wrapper : public detail::column_wrapper { */ dictionary_column_wrapper() : column_wrapper{} { - wrapped = cudf::make_empty_column(cudf::data_type{cudf::type_id::DICTIONARY32}); + wrapped = cudf::make_empty_column(cudf::type_id::DICTIONARY32); } /** @@ -1401,7 +1401,7 @@ class lists_column_wrapper : public detail::column_wrapper { */ lists_column_wrapper() : column_wrapper{} { - build_from_non_nested(make_empty_column(cudf::data_type{cudf::type_to_id()})); + build_from_non_nested(make_empty_column(cudf::type_to_id())); } /** diff --git a/cpp/include/cudf_test/type_list_utilities.hpp b/cpp/include/cudf_test/type_list_utilities.hpp index 1588e3c9be9..42e9af47ece 100644 --- a/cpp/include/cudf_test/type_list_utilities.hpp +++ b/cpp/include/cudf_test/type_list_utilities.hpp @@ -32,7 +32,7 @@ * template * class TestFixture : ::testing::Test { }; * - * TYPED_TEST_CASE(TestFixture, TestTypes); + * TYPED_TEST_SUITE(TestFixture, TestTypes); * * TYPED_TEST(TestFixture, mytest){ * using Type0 = GetType; // the first type element @@ -374,9 +374,8 @@ constexpr bool Exists = ExistsImpl::value; *== false_type * * // Used as a predicate - * using MyTypes = RemoveIf>, - * Types, - *Types>> + * using MyTypes = RemoveIf>>, + * Types, Types>>; * // MyTypes == Types * * ``` @@ -421,9 +420,8 @@ struct RemoveIfImpl> { * RemoveIf>> == Types> * - * using MyTypes = RemoveIf>, - * Types, - *Types>> + * using MyTypes = RemoveIf>>, + * Types, Types>>; * // MyTypes == Types * ``` * diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp index 8e48724f959..2a02caa0326 100644 --- a/cpp/include/cudf_test/type_lists.hpp +++ b/cpp/include/cudf_test/type_lists.hpp @@ -176,7 +176,7 @@ using IntegralTypes = Concat>; * Example: * ``` * // Invokes all typed fixture tests for all floating point types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::FloatingPointTypes); + * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::FloatingPointTypes); * ``` */ using FloatingPointTypes = cudf::test::Types; @@ -188,7 +188,7 @@ using FloatingPointTypes = cudf::test::Types; * Example: * ``` * // Invokes all typed fixture tests for all numeric types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::NumericTypes); + * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::NumericTypes); * ``` */ using NumericTypes = Concat; @@ -200,7 +200,7 @@ using NumericTypes = Concat; * Example: * ``` * // Invokes all typed fixture tests for all timestamp types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::TimestampTypes); + 
* TYPED_TEST_SUITE(MyTypedFixture, cudf::test::TimestampTypes); * ``` */ using TimestampTypes = @@ -213,7 +213,7 @@ using TimestampTypes = * Example: * ``` * // Invokes all typed fixture tests for all duration types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::DurationTypes); + * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::DurationTypes); * ``` */ using DurationTypes = @@ -225,7 +225,7 @@ using DurationTypes = * Example: * ``` * // Invokes all typed fixture tests for all chrono types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::ChronoTypes); + * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::ChronoTypes); * ``` */ using ChronoTypes = Concat; @@ -237,7 +237,7 @@ using ChronoTypes = Concat; * Example: * ``` * // Invokes all typed fixture tests for all string types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::StringTypes); + * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::StringTypes); * ``` */ using StringTypes = cudf::test::Types; @@ -249,7 +249,7 @@ using StringTypes = cudf::test::Types; * Example: * ``` * // Invokes all typed fixture tests for all list types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::ListTypes); + * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::ListTypes); * ``` */ using ListTypes = cudf::test::Types; @@ -261,7 +261,7 @@ using ListTypes = cudf::test::Types; * Example: * ``` * // Invokes all typed fixture tests for all fixed-width types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::FixedPointTypes); + * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::FixedPointTypes); * ``` */ using FixedPointTypes = cudf::test::Types; @@ -273,7 +273,7 @@ using FixedPointTypes = cudf::test::Types; @@ -287,7 +287,7 @@ using FixedWidthTypes = Concat; * Example: * ``` * // Invokes all typed fixture tests for all fixed-width types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::FixedWidthTypesWithoutFixedPoint); + * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::FixedWidthTypesWithoutFixedPoint); * ``` */ using FixedWidthTypesWithoutFixedPoint = Concat; @@ -299,7 +299,7 @@ using FixedWidthTypesWithoutFixedPoint = Concat; * Example: * ``` * // Invokes all typed fixture tests for all fixed-width types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::FixedWidthTypesWithoutChrono); + * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::FixedWidthTypesWithoutChrono); * ``` */ using FixedWidthTypesWithoutChrono = Concat; @@ -310,7 +310,7 @@ using FixedWidthTypesWithoutChrono = Concat; * Example: * ``` * // Invokes all typed fixture tests for all sortable types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::ComparableTypes); + * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::ComparableTypes); * ``` */ using ComparableTypes = Concat; @@ -321,7 +321,7 @@ using ComparableTypes = Concat; * Example: * ``` * // Invokes all typed fixture tests for all compound types in libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::CompoundTypes); + * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::CompoundTypes); * ``` */ using CompoundTypes = @@ -337,10 +337,10 @@ using CompoundTypes = * Example: * ``` * // Invokes all typed fixture tests for all types supported by libcudf - * TYPED_TEST_CASE(MyTypedFixture, cudf::test::AllTypes); + * TYPED_TEST_SUITE(MyTypedFixture, cudf::test::AllTypes); * ``` */ -using AllTypes = Concat; +using AllTypes = Concat; /** * @brief `std::array` of all `cudf::type_id`s diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 9f060c93215..d6fd7d02b44 100644 --- 
a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -1,22 +1,21 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2018-2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.10/RAPIDS.cmake - ${CMAKE_BINARY_DIR}/RAPIDS.cmake) + ${CMAKE_BINARY_DIR}/RAPIDS.cmake +) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) include(rapids-cmake) @@ -25,19 +24,23 @@ include(rapids-cuda) include(rapids-export) include(rapids-find) -project(CUDA_KAFKA VERSION 21.12.00 LANGUAGES CXX) +project( + CUDA_KAFKA + VERSION 21.12.00 + LANGUAGES CXX +) # Set a default build type if none was specified rapids_cmake_build_type(Release) -################################################################################################### -# - Build options +# ################################################################################################## +# * Build options option(BUILD_TESTS "Build tests for libcudf_kafka" ON) message(VERBOSE "CUDF_KAFKA: Build gtests: ${BUILD_TESTS}") -################################################################################################### -# - Dependencies +# ################################################################################################## +# * Dependencies # add third party dependencies using CPM rapids_cpm_init() @@ -45,56 +48,57 @@ include(cmake/thirdparty/get_cudf.cmake) include(cmake/thirdparty/get_rdkafka.cmake) # # GTests if enabled -if (BUILD_TESTS) - # GoogleTest - include(../cmake/thirdparty/get_gtest.cmake) +if(BUILD_TESTS) + # GoogleTest + include(../cmake/thirdparty/get_gtest.cmake) - # include CTest module -- automatically calls enable_testing() - include(CTest) - add_subdirectory(tests) + # include CTest module -- automatically calls enable_testing() + include(CTest) + add_subdirectory(tests) endif() -################################################################################################### -# - library target -------------------------------------------------------------------------------- -add_library(cudf_kafka SHARED - src/kafka_consumer.cpp) +# 
################################################################################################## +# * library target -------------------------------------------------------------------------------- +add_library(cudf_kafka SHARED src/kafka_consumer.cpp) -################################################################################################### -# - include paths --------------------------------------------------------------------------------- -target_include_directories(cudf_kafka - PUBLIC - "$" - "$") +# ################################################################################################## +# * include paths --------------------------------------------------------------------------------- +target_include_directories( + cudf_kafka PUBLIC "$" + "$" +) -################################################################################################### -# - library paths --------------------------------------------------------------------------------- +# ################################################################################################## +# * library paths --------------------------------------------------------------------------------- target_link_libraries(cudf_kafka PUBLIC cudf::cudf RDKAFKA::RDKAFKA) -set_target_properties(cudf_kafka - PROPERTIES BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - # set target compile options - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON) +set_target_properties( + cudf_kafka + PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" # set target compile options + CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON +) -################################################################################################### -# - cudf_kafka Install ---------------------------------------------------------------------------- +# ################################################################################################## +# * cudf_kafka Install ---------------------------------------------------------------------------- rapids_cmake_install_lib_dir(lib_dir) -install(TARGETS cudf_kafka - DESTINATION ${lib_dir} - EXPORT cudf_kafka-exports) - -install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include - DESTINATION include) - -rapids_export(INSTALL cudf_kafka - EXPORT_SET cudf_kafka-exports - GLOBAL_TARGETS cudf_kafka - NAMESPACE cudf_kafka:: - ) - -rapids_export(BUILD cudf_kafka - EXPORT_SET cudf_kafka-exports - GLOBAL_TARGETS cudf_kafka - NAMESPACE cudf_kafka:: - ) +install( + TARGETS cudf_kafka + DESTINATION ${lib_dir} + EXPORT cudf_kafka-exports +) + +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include DESTINATION include) + +rapids_export( + INSTALL cudf_kafka + EXPORT_SET cudf_kafka-exports + GLOBAL_TARGETS cudf_kafka + NAMESPACE cudf_kafka:: +) + +rapids_export( + BUILD cudf_kafka + EXPORT_SET cudf_kafka-exports + GLOBAL_TARGETS cudf_kafka + NAMESPACE cudf_kafka:: +) diff --git a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake index ea749726b97..1e04d40a7d5 100644 --- a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake +++ b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake @@ -1,51 +1,55 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= +# This function finds cudf and sets any additional necessary environment variables. function(find_and_configure_cudf VERSION) - rapids_cmake_parse_version(MAJOR_MINOR ${VERSION} major_minor) - rapids_cpm_find(cudf ${VERSION} - BUILD_EXPORT_SET cudf_kafka-exports - INSTALL_EXPORT_SET cudf_kafka-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/rapidsai/cudf.git - GIT_TAG branch-${major_minor} - GIT_SHALLOW TRUE - SOURCE_SUBDIR cpp - OPTIONS "BUILD_TESTS OFF" - "BUILD_BENCHMARKS OFF") - # If after loading cudf we now have the CMAKE_CUDA_COMPILER - # variable we know that we need to re-enable the cuda language - if(CMAKE_CUDA_COMPILER) - set(cudf_REQUIRES_CUDA TRUE PARENT_SCOPE) - endif() + rapids_cmake_parse_version(MAJOR_MINOR ${VERSION} major_minor) + rapids_cpm_find( + cudf ${VERSION} + BUILD_EXPORT_SET cudf_kafka-exports + INSTALL_EXPORT_SET cudf_kafka-exports + CPM_ARGS + GIT_REPOSITORY https://github.com/rapidsai/cudf.git + GIT_TAG branch-${major_minor} + GIT_SHALLOW TRUE SOURCE_SUBDIR cpp + OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" + ) + # If after loading cudf we now have the CMAKE_CUDA_COMPILER variable we know that we need to + # re-enable the cuda language + if(CMAKE_CUDA_COMPILER) + set(cudf_REQUIRES_CUDA + TRUE + PARENT_SCOPE + ) + endif() endfunction() -set(CUDA_KAFKA_MIN_VERSION_cudf "${CUDA_KAFKA_VERSION_MAJOR}.${CUDA_KAFKA_VERSION_MINOR}.${CUDA_KAFKA_VERSION_PATCH}") +set(CUDA_KAFKA_MIN_VERSION_cudf + "${CUDA_KAFKA_VERSION_MAJOR}.${CUDA_KAFKA_VERSION_MINOR}.${CUDA_KAFKA_VERSION_PATCH}" +) find_and_configure_cudf(${CUDA_KAFKA_MIN_VERSION_cudf}) if(cudf_REQUIRES_CUDA) - rapids_cuda_init_architectures(CUDA_KAFKA) + rapids_cuda_init_architectures(CUDA_KAFKA) - # Since we are building cudf as part of ourselves we need - # to enable the CUDA language in the top-most scope - enable_language(CUDA) + # Since we are building cudf as part of ourselves we need to enable the CUDA language in the + # top-most scope + enable_language(CUDA) - # Since CUDA_KAFKA only enables CUDA optionally we need to manually include the file that - # rapids_cuda_init_architectures relies on `project` calling - if(DEFINED CMAKE_PROJECT_CUDA_KAFKA_INCLUDE) - include("${CMAKE_PROJECT_CUDA_KAFKA_INCLUDE}") - endif() + # Since CUDA_KAFKA only enables CUDA optionally we need to manually include the file that + # rapids_cuda_init_architectures relies on `project` calling + if(DEFINED 
CMAKE_PROJECT_CUDA_KAFKA_INCLUDE) + include("${CMAKE_PROJECT_CUDA_KAFKA_INCLUDE}") + endif() endif() diff --git a/cpp/libcudf_kafka/cmake/thirdparty/get_rdkafka.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_rdkafka.cmake index 3a4fffd5000..3b3342cb297 100644 --- a/cpp/libcudf_kafka/cmake/thirdparty/get_rdkafka.cmake +++ b/cpp/libcudf_kafka/cmake/thirdparty/get_rdkafka.cmake @@ -1,40 +1,40 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= - -function( get_RDKafka ) - rapids_find_generate_module(RDKAFKA +# This function finds rdkafka and sets any additional necessary environment variables. +function(get_RDKafka) + rapids_find_generate_module( + RDKAFKA HEADER_NAMES rdkafkacpp.h INCLUDE_SUFFIXES librdkafka LIBRARY_NAMES rdkafka++ BUILD_EXPORT_SET cudf_kafka-exports INSTALL_EXPORT_SET cudf_kafka-exports - ) + ) if(DEFINED ENV{RDKAFKA_ROOT}) - # Since this is inside a function the modification of - # CMAKE_PREFIX_PATH won't leak to other callers/users + # Since this is inside a function the modification of CMAKE_PREFIX_PATH won't leak to other + # callers/users list(APPEND CMAKE_PREFIX_PATH "$ENV{RDKAFKA_ROOT}") list(APPEND CMAKE_PREFIX_PATH "$ENV{RDKAFKA_ROOT}/build") endif() - - rapids_find_package(RDKAFKA REQUIRED + rapids_find_package( + RDKAFKA REQUIRED BUILD_EXPORT_SET cudf_kafka-exports - INSTALL_EXPORT_SET cudf_kafka-exports) + INSTALL_EXPORT_SET cudf_kafka-exports + ) endfunction() diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt index f0c2664cd96..3920758f3f2 100644 --- a/cpp/libcudf_kafka/tests/CMakeLists.txt +++ b/cpp/libcudf_kafka/tests/CMakeLists.txt @@ -1,32 +1,34 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2018-2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= -################################################################################################### -# - compiler function ----------------------------------------------------------------------------- +# ################################################################################################## +# * compiler function ----------------------------------------------------------------------------- -function(ConfigureTest test_name ) - add_executable(${test_name} ${ARGN}) - set_target_properties(${test_name} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") - target_link_libraries(${test_name} PRIVATE GTest::gmock_main GTest::gtest_main cudf_kafka) +# This function takes in a test name and test source and handles setting all of the associated +# properties and linking to build the test +function(ConfigureTest test_name) + add_executable(${test_name} ${ARGN}) + set_target_properties( + ${test_name} PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "$" + ) + target_link_libraries(${test_name} PRIVATE GTest::gmock_main GTest::gtest_main cudf_kafka) - add_test(NAME ${test_name} COMMAND ${test_name}) + add_test(NAME ${test_name} COMMAND ${test_name}) endfunction() -################################################################################################### -# - Kafka host tests ---------------------------------------------------------------------------------- -ConfigureTest(KAFKA_HOST_TEST - kafka_consumer_tests.cpp) +# ################################################################################################## +# * Kafka host tests +# ---------------------------------------------------------------------------------- +ConfigureTest(KAFKA_HOST_TEST kafka_consumer_tests.cpp) diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh new file mode 100755 index 00000000000..76de008b14a --- /dev/null +++ b/cpp/scripts/run-cmake-format.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# This script is a pre-commit hook that wraps cmakelang's cmake linters. The +# wrapping is necessary because RAPIDS libraries split configuration for +# cmakelang linters between a local config file and a second config file that's +# shared across all of RAPIDS via rapids-cmake. In order to keep it up to date +# this file is only maintained in one place (the rapids-cmake repo) and +# pulled down during builds. 
We need a way to invoke CMake linting commands +# without causing pre-commit failures (which could block local commits or CI), +# while also being sufficiently flexible to allow users to maintain the config +# file independently of a build directory. +# +# This script provides the minimal functionality to enable those use cases. It +# searches in a number of predefined locations for the rapids-cmake config file +# and exits gracefully if the file is not found. If a user wishes to specify a +# config file at a nonstandard location, they may do so by setting the +# environment variable RAPIDS_CMAKE_FORMAT_FILE. +# +# While this script can be invoked directly (but only from the repo root since +# all paths are relative to that), it is advisable to instead use the +# pre-commit hooks via +# `pre-commit run (cmake-format)|(cmake-lint)`. +# +# Usage: +# bash run-cmake-format.sh {cmake-format,cmake-lint} infile [infile ...] + +# Note that pre-commit always runs from the root of the repository, so relative +# paths are automatically relative to the repo root. +DEFAULT_FORMAT_FILE_LOCATIONS=( + "cpp/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" + "${CUDF_ROOT:-${HOME}}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" + "cpp/libcudf_kafka/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json" +) + +if [ -z ${RAPIDS_CMAKE_FORMAT_FILE:+PLACEHOLDER} ]; then + for file_path in ${DEFAULT_FORMAT_FILE_LOCATIONS[@]}; do + if [ -f ${file_path} ]; then + RAPIDS_CMAKE_FORMAT_FILE=${file_path} + break + fi + done +fi + +if [ -z ${RAPIDS_CMAKE_FORMAT_FILE:+PLACEHOLDER} ]; then + echo "The rapids-cmake cmake-format configuration file was not found at any of the default search locations: " + echo "" + ( IFS=$'\n'; echo "${DEFAULT_FORMAT_FILE_LOCATIONS[*]}" ) + echo "" + echo "Try setting the environment variable RAPIDS_CMAKE_FORMAT_FILE to the path to the config file."
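+  # For example (path is hypothetical): RAPIDS_CMAKE_FORMAT_FILE=/path/to/cmake-format-rapids-cmake.json pre-commit run cmake-format --all-files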
+ exit 0 +else + echo "Using format file ${RAPIDS_CMAKE_FORMAT_FILE}" +fi + +if [[ $1 == "cmake-format" ]]; then + cmake-format -i --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2} +elif [[ $1 == "cmake-lint" ]]; then + cmake-lint --config-files cpp/cmake/config.json ${RAPIDS_CMAKE_FORMAT_FILE} -- ${@:2} +fi diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 7adf2d5112a..31bf9d65d56 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -202,6 +202,16 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } +std::vector> simple_aggregations_collector::visit( + data_type col_type, covariance_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} +std::vector> simple_aggregations_collector::visit( + data_type col_type, correlation_aggregation const& agg) +{ + return visit(col_type, static_cast(agg)); +} std::vector> simple_aggregations_collector::visit( data_type col_type, tdigest_aggregation const& agg) { @@ -358,6 +368,16 @@ void aggregation_finalizer::visit(merge_m2_aggregation const& agg) visit(static_cast(agg)); } +void aggregation_finalizer::visit(covariance_aggregation const& agg) +{ + visit(static_cast(agg)); +} + +void aggregation_finalizer::visit(correlation_aggregation const& agg) +{ + visit(static_cast(agg)); +} + void aggregation_finalizer::visit(tdigest_aggregation const& agg) { visit(static_cast(agg)); @@ -691,6 +711,28 @@ std::unique_ptr make_merge_m2_aggregation() template std::unique_ptr make_merge_m2_aggregation(); template std::unique_ptr make_merge_m2_aggregation(); +/// Factory to create a COVARIANCE aggregation +template +std::unique_ptr make_covariance_aggregation(size_type min_periods, size_type ddof) +{ + return std::make_unique(min_periods, ddof); +} +template std::unique_ptr make_covariance_aggregation( + size_type min_periods, size_type ddof); +template std::unique_ptr make_covariance_aggregation( + size_type min_periods, size_type ddof); + +/// Factory to create a CORRELATION aggregation +template +std::unique_ptr make_correlation_aggregation(correlation_type type, size_type min_periods) +{ + return std::make_unique(type, min_periods); +} +template std::unique_ptr make_correlation_aggregation( + correlation_type type, size_type min_periods); +template std::unique_ptr make_correlation_aggregation( + correlation_type type, size_type min_periods); + template std::unique_ptr make_tdigest_aggregation(int max_centroids) { diff --git a/cpp/src/aggregation/result_cache.cpp b/cpp/src/aggregation/result_cache.cpp index 1889ae67ee3..ea6894b5ed3 100644 --- a/cpp/src/aggregation/result_cache.cpp +++ b/cpp/src/aggregation/result_cache.cpp @@ -30,10 +30,10 @@ void result_cache::add_result(column_view const& input, { // We can't guarantee that agg will outlive the cache, so we need to take ownership of a copy. // To allow lookup by reference, make the key a reference and keep the owner in the value pair. 
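To make the new COVARIANCE and CORRELATION factories introduced in aggregation.cpp above more concrete, here is a rough caller-side sketch of requesting them through the groupby API. This is an illustration only: the column names are hypothetical, the groupby_aggregation template argument and the parameter defaults are inferred from the factory declarations in this diff, and the values column is assumed to be a struct column with exactly two numeric children, as the sort-based implementation later in this change requires.

  // Sketch only: `keys_col` and `xy_struct_col` are assumed pre-existing cudf::column_view objects;
  // `xy_struct_col` is a STRUCT column whose two children hold the x and y values.
  cudf::groupby::groupby gb{cudf::table_view{{keys_col}}};

  std::vector<cudf::groupby::aggregation_request> requests(1);
  requests[0].values = xy_struct_col;
  requests[0].aggregations.push_back(
    cudf::make_covariance_aggregation<cudf::groupby_aggregation>(/*min_periods=*/1, /*ddof=*/1));
  requests[0].aggregations.push_back(cudf::make_correlation_aggregation<cudf::groupby_aggregation>(
    cudf::correlation_type::PEARSON, /*min_periods=*/1));

  // One result column per requested aggregation, grouped by the unique key values.
  auto [unique_keys, results] = gb.aggregate(requests);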
- auto owned_agg = agg.clone(); - auto const& key = *owned_agg; - auto value = std::make_pair(std::move(owned_agg), std::move(col)); - _cache[{input, key}] = std::move(value); + auto owned_agg = agg.clone(); + auto const& key = *owned_agg; + // try_emplace doesn't update/insert if already present + _cache.try_emplace({input, key}, std::move(owned_agg), std::move(col)); } column_view result_cache::get_result(column_view const& input, aggregation const& agg) const diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 6b03a97c59b..73a3f55163d 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -59,9 +59,9 @@ rmm::device_buffer scalar_col_valid_mask_and(column_view const& col, { if (col.is_empty()) return rmm::device_buffer{0, stream, mr}; - if (not s.is_valid()) { + if (not s.is_valid(stream)) { return cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr); - } else if (s.is_valid() and col.nullable()) { + } else if (s.is_valid(stream) and col.nullable()) { return cudf::detail::copy_bitmask(col, stream, mr); } else { return rmm::device_buffer{0, stream, mr}; @@ -152,7 +152,7 @@ void binary_operation(mutable_column_view& out, out.null_mask(), lhs.null_mask(), lhs.offset(), - rhs.is_valid()); + rhs.is_valid(stream)); } else { std::string kernel_name = jitify2::reflection::Template("cudf::binops::jit::kernel_v_s") // @@ -456,14 +456,14 @@ std::unique_ptr fixed_point_binary_operation(scalar const& lhs, auto const diff = lhs.type().scale() - rhs.type().scale(); if (lhs.type().id() == type_id::DECIMAL32) { auto const factor = numeric::detail::ipow(diff); - auto const val = static_cast const&>(lhs).value(); + auto const val = static_cast const&>(lhs).value(stream); auto const scale = scale_type{rhs.type().scale()}; auto const scalar = make_fixed_point_scalar(val * factor, scale); binops::jit::binary_operation(out_view, *scalar, rhs, op, stream); } else { CUDF_EXPECTS(lhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); - auto const val = static_cast const&>(lhs).value(); + auto const val = static_cast const&>(lhs).value(stream); auto const scale = scale_type{rhs.type().scale()}; auto const scalar = make_fixed_point_scalar(val * factor, scale); binops::jit::binary_operation(out_view, *scalar, rhs, op, stream); @@ -526,14 +526,14 @@ std::unique_ptr fixed_point_binary_operation(column_view const& lhs, auto const diff = rhs.type().scale() - lhs.type().scale(); if (rhs.type().id() == type_id::DECIMAL32) { auto const factor = numeric::detail::ipow(diff); - auto const val = static_cast const&>(rhs).value(); + auto const val = static_cast const&>(rhs).value(stream); auto const scale = scale_type{lhs.type().scale()}; auto const scalar = make_fixed_point_scalar(val * factor, scale); binops::jit::binary_operation(out_view, lhs, *scalar, op, stream); } else { CUDF_EXPECTS(rhs.type().id() == type_id::DECIMAL64, "Unexpected DTYPE"); auto const factor = numeric::detail::ipow(diff); - auto const val = static_cast const&>(rhs).value(); + auto const val = static_cast const&>(rhs).value(stream); auto const scale = scale_type{rhs.type().scale()}; auto const scalar = make_fixed_point_scalar(val * factor, scale); binops::jit::binary_operation(out_view, lhs, *scalar, op, stream); diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp index 86059a72e8f..fefe0b3c862 100644 --- a/cpp/src/column/column_factories.cpp +++ b/cpp/src/column/column_factories.cpp @@ 
-68,6 +68,9 @@ std::unique_ptr make_empty_column(data_type type) return std::make_unique(type, 0, rmm::device_buffer{}); } +// Empty column of specified type id +std::unique_ptr make_empty_column(type_id id) { return make_empty_column(data_type{id}); } + // Allocate storage for a specified number of numeric elements std::unique_ptr make_numeric_column(data_type type, size_type size, @@ -163,8 +166,8 @@ std::unique_ptr make_dictionary_from_scalar(scalar const& s, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (size == 0) return make_empty_column(data_type{type_id::DICTIONARY32}); - CUDF_EXPECTS(s.is_valid(), "cannot create a dictionary with a null key"); + if (size == 0) return make_empty_column(type_id::DICTIONARY32); + CUDF_EXPECTS(s.is_valid(stream), "cannot create a dictionary with a null key"); return make_dictionary_column( make_column_from_scalar(s, 1, stream, mr), make_column_from_scalar(numeric_scalar(0), size, stream, mr), diff --git a/cpp/src/column/column_factories.cu b/cpp/src/column/column_factories.cu index ebe483f9725..6b74b37044b 100644 --- a/cpp/src/column/column_factories.cu +++ b/cpp/src/column/column_factories.cu @@ -34,7 +34,7 @@ struct column_from_scalar_dispatch { rmm::mr::device_memory_resource* mr) const { if (size == 0) return make_empty_column(value.type()); - if (!value.is_valid()) + if (!value.is_valid(stream)) return make_fixed_width_column(value.type(), size, mask_state::ALL_NULL, stream, mr); auto output_column = make_fixed_width_column(value.type(), size, mask_state::UNALLOCATED, stream, mr); @@ -54,7 +54,7 @@ std::unique_ptr column_from_scalar_dispatch::operator()( value.type(), size, rmm::device_buffer{}, std::move(null_mask), size); @@ -101,7 +101,7 @@ std::unique_ptr column_from_scalar_dispatch::operator()release()), is_valid ? 
0 : size, diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index 21a27ff8c3d..f4d09c8e0be 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -20,10 +20,13 @@ #include #include #include +#include +#include #include #include #include +#include namespace cudf { namespace detail { @@ -72,28 +75,18 @@ struct copy_if_else_functor_impl auto const& lhs = *p_lhs; auto const& rhs = *p_rhs; - if (left_nullable) { - if (right_nullable) { - auto lhs_iter = cudf::detail::make_pair_iterator(lhs); - auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return detail::copy_if_else( - true, lhs_iter, lhs_iter + size, rhs_iter, filter, lhs.type(), stream, mr); - } - auto lhs_iter = cudf::detail::make_pair_iterator(lhs); - auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return detail::copy_if_else( - true, lhs_iter, lhs_iter + size, rhs_iter, filter, lhs.type(), stream, mr); - } - if (right_nullable) { - auto lhs_iter = cudf::detail::make_pair_iterator(lhs); - auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return detail::copy_if_else( - true, lhs_iter, lhs_iter + size, rhs_iter, filter, lhs.type(), stream, mr); - } - auto lhs_iter = cudf::detail::make_pair_iterator(lhs); - auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return detail::copy_if_else( - false, lhs_iter, lhs_iter + size, rhs_iter, filter, lhs.type(), stream, mr); + auto lhs_iter = + cudf::detail::make_optional_iterator(lhs, contains_nulls::DYNAMIC{}, left_nullable); + auto rhs_iter = + cudf::detail::make_optional_iterator(rhs, contains_nulls::DYNAMIC{}, right_nullable); + return detail::copy_if_else(left_nullable || right_nullable, + lhs_iter, + lhs_iter + size, + rhs_iter, + filter, + lhs.type(), + stream, + mr); } }; @@ -119,24 +112,10 @@ struct copy_if_else_functor_impl { auto const& lhs = *p_lhs; auto const& rhs = *p_rhs; - if (left_nullable) { - if (right_nullable) { - auto lhs_iter = cudf::detail::make_pair_iterator(lhs); - auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return strings::detail::copy_if_else( - lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); - } - auto lhs_iter = cudf::detail::make_pair_iterator(lhs); - auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); - } - if (right_nullable) { - auto lhs_iter = cudf::detail::make_pair_iterator(lhs); - auto rhs_iter = cudf::detail::make_pair_iterator(rhs); - return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); - } - auto lhs_iter = cudf::detail::make_pair_iterator(lhs); - auto rhs_iter = cudf::detail::make_pair_iterator(rhs); + auto lhs_iter = + cudf::detail::make_optional_iterator(lhs, contains_nulls::DYNAMIC{}, left_nullable); + auto rhs_iter = + cudf::detail::make_optional_iterator(rhs, contains_nulls::DYNAMIC{}, right_nullable); return strings::detail::copy_if_else(lhs_iter, lhs_iter + size, rhs_iter, filter, stream, mr); } }; @@ -260,6 +239,38 @@ std::unique_ptr scatter_gather_based_if_else(cudf::scalar const& lhs, return scatter_gather_based_if_else(lhs, rhs_col->view(), size, is_left, stream, mr); } +template <> +struct copy_if_else_functor_impl { + template + std::unique_ptr operator()(Left const& lhs, + Right const& rhs, + size_type size, + bool, + bool, + Filter filter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr); + } +}; + +template <> +struct 
copy_if_else_functor_impl { + template + std::unique_ptr operator()(Left const& lhs, + Right const& rhs, + size_type size, + bool, + bool, + Filter filter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr); + } +}; + /** * @brief Functor called by the `type_dispatcher` to invoke copy_if_else on combinations * of column_view and scalar @@ -275,12 +286,6 @@ struct copy_if_else_functor { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if constexpr (std::is_same_v or std::is_same_v) { - (void)left_nullable; - (void)right_nullable; - return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr); - } - copy_if_else_functor_impl copier{}; return copier(lhs, rhs, size, left_nullable, right_nullable, filter, stream, mr); } @@ -305,35 +310,21 @@ std::unique_ptr copy_if_else(Left const& lhs, auto bool_mask_device_p = column_device_view::create(boolean_mask); column_device_view bool_mask_device = *bool_mask_device_p; - if (boolean_mask.has_nulls()) { - auto filter = [bool_mask_device] __device__(cudf::size_type i) { - return bool_mask_device.is_valid_nocheck(i) and bool_mask_device.element(i); - }; - return cudf::type_dispatcher(lhs.type(), - copy_if_else_functor{}, - lhs, - rhs, - boolean_mask.size(), - left_nullable, - right_nullable, - filter, - stream, - mr); - } else { - auto filter = [bool_mask_device] __device__(cudf::size_type i) { - return bool_mask_device.element(i); - }; - return cudf::type_dispatcher(lhs.type(), - copy_if_else_functor{}, - lhs, - rhs, - boolean_mask.size(), - left_nullable, - right_nullable, - filter, - stream, - mr); - } + auto const has_nulls = boolean_mask.has_nulls(); + auto filter = [bool_mask_device, has_nulls] __device__(cudf::size_type i) { + return (!has_nulls || bool_mask_device.is_valid_nocheck(i)) and + bool_mask_device.element(i); + }; + return cudf::type_dispatcher(lhs.type(), + copy_if_else_functor{}, + lhs, + rhs, + boolean_mask.size(), + left_nullable, + right_nullable, + filter, + stream, + mr); } }; // namespace @@ -358,7 +349,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, { CUDF_EXPECTS(boolean_mask.size() == rhs.size(), "Boolean mask column must be the same size as rhs column"); - return copy_if_else(lhs, rhs, !lhs.is_valid(), rhs.has_nulls(), boolean_mask, stream, mr); + return copy_if_else(lhs, rhs, !lhs.is_valid(stream), rhs.has_nulls(), boolean_mask, stream, mr); } std::unique_ptr copy_if_else(column_view const& lhs, @@ -369,7 +360,7 @@ std::unique_ptr copy_if_else(column_view const& lhs, { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs column"); - return copy_if_else(lhs, rhs, lhs.has_nulls(), !rhs.is_valid(), boolean_mask, stream, mr); + return copy_if_else(lhs, rhs, lhs.has_nulls(), !rhs.is_valid(stream), boolean_mask, stream, mr); } std::unique_ptr copy_if_else(scalar const& lhs, @@ -378,7 +369,8 @@ std::unique_ptr copy_if_else(scalar const& lhs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return copy_if_else(lhs, rhs, !lhs.is_valid(), !rhs.is_valid(), boolean_mask, stream, mr); + return copy_if_else( + lhs, rhs, !lhs.is_valid(stream), !rhs.is_valid(stream), boolean_mask, stream, mr); } }; // namespace detail diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu index 42dc9f76b18..3e0b27e9f19 100644 --- a/cpp/src/copying/sample.cu +++ b/cpp/src/copying/sample.cu @@ -70,8 +70,9 @@ std::unique_ptr
sample(table_view const& input, gather_map_mutable_view.begin(), thrust::default_random_engine(seed)); - auto gather_map_view = - (n == num_rows) ? gather_map->view() : cudf::slice(gather_map->view(), {0, n})[0]; + auto gather_map_view = (n == num_rows) + ? gather_map->view() + : cudf::detail::slice(gather_map->view(), {0, n}, stream)[0]; return detail::gather(input, gather_map_view.begin(), gather_map_view.end(), diff --git a/cpp/src/copying/segmented_shift.cu b/cpp/src/copying/segmented_shift.cu index 6fc785a61c6..62f992012cd 100644 --- a/cpp/src/copying/segmented_shift.cu +++ b/cpp/src/copying/segmented_shift.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -33,88 +34,24 @@ namespace detail { namespace { /** - * @brief Helper function to invoke general `copy_if_else` + * @brief Common filter function to convert index values into copy-if-else left/right result. + * + * The offset position is used to identify which segment to copy from. */ -template -std::unique_ptr segmented_shift_rep_impl(PairIterator input_pair_iterator, - ScalarIterator fill_pair_iterator, - bool nullable, - size_type offset, - device_span segment_offsets, - data_type value_type, - size_type column_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (offset > 0) { - auto filter = [segment_offsets, offset] __device__(auto const& i) { - auto segment_bound_idx = - thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i) - 1; - return not(*segment_bound_idx <= i and i < *segment_bound_idx + offset); - }; - return copy_if_else(nullable, - input_pair_iterator, - input_pair_iterator + column_size, - fill_pair_iterator, - filter, - value_type, - stream, - mr); - } else { - auto filter = [segment_offsets, offset] __device__(auto const& i) { - auto segment_bound_idx = - thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i); - return not(*segment_bound_idx + offset <= i and i < *segment_bound_idx); - }; - return copy_if_else(nullable, - input_pair_iterator, - input_pair_iterator + column_size, - fill_pair_iterator, - filter, - value_type, - stream, - mr); - } -} +struct segmented_shift_filter { + device_span const segment_offsets; + size_type const offset; -/** - * @brief Helper function to invoke string specialization of `copy_if_else` - */ -template -std::unique_ptr segmented_shift_string_impl(PairIterator input_pair_iterator, - ScalarIterator fill_pair_iterator, - size_type offset, - device_span segment_offsets, - size_type column_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (offset > 0) { - auto filter = [segment_offsets, offset] __device__(auto const& i) { - auto segment_bound_idx = - thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i) - 1; - return not(*segment_bound_idx <= i and i < *segment_bound_idx + offset); - }; - return strings::detail::copy_if_else(input_pair_iterator, - input_pair_iterator + column_size, - fill_pair_iterator, - filter, - stream, - mr); - } else { - auto filter = [segment_offsets, offset] __device__(auto const& i) { - auto segment_bound_idx = - thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i); - return not(*segment_bound_idx + offset <= i and i < *segment_bound_idx); - }; - return strings::detail::copy_if_else(input_pair_iterator, - input_pair_iterator + column_size, - fill_pair_iterator, - filter, - stream, - mr); - } -} + __device__ bool operator()(size_type const 
i) const + { + auto const segment_bound_idx = + thrust::upper_bound(thrust::seq, segment_offsets.begin(), segment_offsets.end(), i) - + (offset > 0); + auto const left_idx = *segment_bound_idx + (offset < 0 ? offset : 0); + auto const right_idx = *segment_bound_idx + (offset > 0 ? offset : 0); + return not(left_idx <= i and i < right_idx); + }; +}; template struct segmented_shift_functor { @@ -138,32 +75,20 @@ struct segmented_shift_functor() rmm::mr::device_memory_resource* mr) { auto values_device_view = column_device_view::create(segmented_values, stream); - auto fill_pair_iterator = make_pair_iterator(fill_value); - bool nullable = not fill_value.is_valid() or segmented_values.nullable(); - - if (segmented_values.has_nulls()) { - auto input_pair_iterator = make_pair_iterator(*values_device_view) - offset; - return segmented_shift_rep_impl(input_pair_iterator, - fill_pair_iterator, - nullable, - offset, - segment_offsets, - segmented_values.type(), - segmented_values.size(), - stream, - mr); - } else { - auto input_pair_iterator = make_pair_iterator(*values_device_view) - offset; - return segmented_shift_rep_impl(input_pair_iterator, - fill_pair_iterator, - nullable, - offset, - segment_offsets, - segmented_values.type(), - segmented_values.size(), - stream, - mr); - } + bool nullable = not fill_value.is_valid(stream) or segmented_values.nullable(); + auto input_iterator = + cudf::detail::make_optional_iterator( + *values_device_view, contains_nulls::DYNAMIC{}, segmented_values.has_nulls()) - + offset; + auto fill_iterator = cudf::detail::make_optional_iterator(fill_value, contains_nulls::YES{}); + return copy_if_else(nullable, + input_iterator, + input_iterator + segmented_values.size(), + fill_iterator, + segmented_shift_filter{segment_offsets, offset}, + segmented_values.type(), + stream, + mr); } }; @@ -179,29 +104,19 @@ struct segmented_shift_functor { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - using T = string_view; - auto values_device_view = column_device_view::create(segmented_values, stream); - auto fill_pair_iterator = make_pair_iterator(fill_value); - if (segmented_values.has_nulls()) { - auto input_pair_iterator = make_pair_iterator(*values_device_view) - offset; - return segmented_shift_string_impl(input_pair_iterator, - fill_pair_iterator, - offset, - segment_offsets, - segmented_values.size(), - stream, - mr); - } else { - auto input_pair_iterator = make_pair_iterator(*values_device_view) - offset; - return segmented_shift_string_impl(input_pair_iterator, - fill_pair_iterator, - offset, - segment_offsets, - segmented_values.size(), + auto input_iterator = + make_optional_iterator( + *values_device_view, contains_nulls::DYNAMIC{}, segmented_values.has_nulls()) - + offset; + auto fill_iterator = + make_optional_iterator(fill_value, contains_nulls::YES{}); + return strings::detail::copy_if_else(input_iterator, + input_iterator + segmented_values.size(), + fill_iterator, + segmented_shift_filter{segment_offsets, offset}, stream, mr); - } } }; diff --git a/cpp/src/copying/slice.cu b/cpp/src/copying/slice.cu index d1c12056393..9a3e349b907 100644 --- a/cpp/src/copying/slice.cu +++ b/cpp/src/copying/slice.cu @@ -29,7 +29,7 @@ namespace cudf { namespace detail { std::vector slice(column_view const& input, - std::vector const& indices, + host_span indices, rmm::cuda_stream_view stream) { CUDF_EXPECTS(indices.size() % 2 == 0, "indices size must be even"); @@ -64,16 +64,15 @@ std::vector slice(column_view const& input, } std::vector slice(table_view 
const& input, - std::vector const& indices, + host_span indices, rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); CUDF_EXPECTS(indices.size() % 2 == 0, "indices size must be even"); if (indices.empty()) { return {}; } // 2d arrangement of column_views that represent the outgoing table_views sliced_table[i][j] // where i is the i'th column of the j'th table_view - auto op = [&indices, stream](auto const& c) { return cudf::detail::slice(c, indices, stream); }; + auto op = [&indices, &stream](auto const& c) { return cudf::detail::slice(c, indices, stream); }; auto f = thrust::make_transform_iterator(input.begin(), op); auto sliced_table = std::vector>(f, f + input.num_columns()); @@ -93,20 +92,44 @@ std::vector slice(table_view const& input, return result; } +std::vector slice(column_view const& input, + std::initializer_list indices, + rmm::cuda_stream_view stream) +{ + return slice(input, host_span(indices.begin(), indices.size()), stream); +} + +std::vector slice(table_view const& input, + std::initializer_list indices, + rmm::cuda_stream_view stream) +{ + return slice(input, host_span(indices.begin(), indices.size()), stream); +}; + } // namespace detail -std::vector slice(cudf::column_view const& input, - std::vector const& indices) +std::vector slice(column_view const& input, host_span indices) { CUDF_FUNC_RANGE(); return detail::slice(input, indices, rmm::cuda_stream_default); } -std::vector slice(cudf::table_view const& input, - std::vector const& indices) +std::vector slice(table_view const& input, host_span indices) +{ + CUDF_FUNC_RANGE(); + return detail::slice(input, indices, rmm::cuda_stream_default); +}; + +std::vector slice(column_view const& input, std::initializer_list indices) { CUDF_FUNC_RANGE(); return detail::slice(input, indices, rmm::cuda_stream_default); } +std::vector slice(table_view const& input, std::initializer_list indices) +{ + CUDF_FUNC_RANGE(); + return detail::slice(input, indices, rmm::cuda_stream_default); +}; + } // namespace cudf diff --git a/cpp/src/copying/split.cpp b/cpp/src/copying/split.cpp index 97520800408..0fa802eb4b2 100644 --- a/cpp/src/copying/split.cpp +++ b/cpp/src/copying/split.cpp @@ -15,16 +15,22 @@ */ #include -#include +#include #include #include +#include + #include namespace cudf { +namespace detail { namespace { template -std::vector split(T const& input, size_type column_size, std::vector const& splits) +std::vector split(T const& input, + size_type column_size, + host_span splits, + rmm::cuda_stream_view stream) { if (splits.empty() or column_size == 0) { return std::vector{input}; } CUDF_EXPECTS(splits.back() <= column_size, "splits can't exceed size of input columns"); @@ -38,24 +44,67 @@ std::vector split(T const& input, size_type column_size, std::vector split(cudf::column_view const& input, - std::vector const& splits) + host_span splits, + rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); - return split(input, input.size(), splits); + return split(input, input.size(), splits, stream); } std::vector split(cudf::table_view const& input, - std::vector const& splits) + host_span splits, + rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); std::vector result{}; if (input.num_columns() == 0) { return result; } - return split(input, input.column(0).size(), splits); + return split(input, input.column(0).size(), splits, stream); +} + +std::vector split(column_view const& input, + std::initializer_list splits, + rmm::cuda_stream_view stream) +{ + return split(input, host_span(splits.begin(), splits.size()), stream); +} + 
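As a quick illustration of the new overloads being added here, callers can now pass split points inline as a braced list (or any host_span of indices) instead of materializing a std::vector first. The views below (`col`, `tbl`) are hypothetical, pre-existing inputs:

  // slice takes [begin, end) index pairs and returns one view per pair.
  std::vector<cudf::column_view> parts = cudf::slice(col, {0, 100, 100, 200});

  // split takes cut points and returns N+1 contiguous views covering the whole input.
  std::vector<cudf::table_view> chunks = cudf::split(tbl, {500, 1000});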
+std::vector split(table_view const& input, + std::initializer_list splits, + rmm::cuda_stream_view stream) +{ + return detail::split(input, host_span(splits.begin(), splits.size()), stream); +} + +} // namespace detail + +std::vector split(cudf::column_view const& input, + host_span splits) +{ + CUDF_FUNC_RANGE(); + return detail::split(input, splits, rmm::cuda_stream_default); +} + +std::vector split(cudf::table_view const& input, + host_span splits) +{ + CUDF_FUNC_RANGE(); + return detail::split(input, splits, rmm::cuda_stream_default); +} + +std::vector split(column_view const& input, std::initializer_list splits) +{ + CUDF_FUNC_RANGE(); + return detail::split(input, splits, rmm::cuda_stream_default); +} + +std::vector split(table_view const& input, std::initializer_list splits) +{ + CUDF_FUNC_RANGE(); + return detail::split(input, splits, rmm::cuda_stream_default); } } // namespace cudf diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index 6e892b3e461..34106bef4ae 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -320,24 +321,10 @@ struct add_calendrical_months_functor { timestamp_column.begin(), timestamp_column.end(), months_begin, - output_mview.begin(), - [] __device__(auto time_val, auto months_val) { - using namespace cuda::std::chrono; - using duration_m = duration; - - // Get the days component from the input - auto days_since_epoch = floor(time_val); - - // Add the number of months - year_month_day ymd{days_since_epoch}; - ymd += duration_m{months_val}; - - // If the new date isn't valid, scale it back to the last day of the - // month. - if (!ymd.ok()) ymd = ymd.year() / ymd.month() / last; - - // Put back the time component to the date - return sys_days{ymd} + (time_val - days_since_epoch); + output->mutable_view().begin(), + [] __device__(auto& timestamp, auto& months) { + return add_calendrical_months_with_scale_back( + timestamp, cuda::std::chrono::months{months}); }); return output; } diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu index d701fb689cb..fb183859f0d 100644 --- a/cpp/src/dictionary/decode.cu +++ b/cpp/src/dictionary/decode.cu @@ -36,7 +36,7 @@ std::unique_ptr decode(dictionary_column_view const& source, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (source.is_empty()) return make_empty_column(data_type{type_id::EMPTY}); + if (source.is_empty()) return make_empty_column(type_id::EMPTY); column_view indices{source.indices().type(), source.size(), diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu index 35e7d5fbc27..d804a587478 100644 --- a/cpp/src/dictionary/dictionary_factories.cu +++ b/cpp/src/dictionary/dictionary_factories.cu @@ -53,7 +53,7 @@ std::unique_ptr make_dictionary_column(column_view const& keys_column, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(!keys_column.has_nulls(), "keys column must not have nulls"); - if (keys_column.is_empty()) return make_empty_column(data_type{type_id::DICTIONARY32}); + if (keys_column.is_empty()) return make_empty_column(type_id::DICTIONARY32); auto keys_copy = std::make_unique(keys_column, stream, mr); auto indices_copy = diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu index 501e034c5fe..839b28413a6 100644 --- a/cpp/src/dictionary/encode.cu +++ b/cpp/src/dictionary/encode.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include 
#include #include @@ -53,7 +53,9 @@ std::unique_ptr encode(column_view const& input_column, if (keys_column->has_nulls()) { keys_column = std::make_unique( - slice(keys_column->view(), std::vector{0, keys_column->size() - 1}).front(), + cudf::detail::slice( + keys_column->view(), std::vector{0, keys_column->size() - 1}, stream) + .front(), stream, mr); keys_column->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); // remove the null-mask diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu index 11c81ee434b..4acc2d124b2 100644 --- a/cpp/src/dictionary/replace.cu +++ b/cpp/src/dictionary/replace.cu @@ -58,17 +58,14 @@ std::unique_ptr replace_indices(column_view const& input, auto const d_input = *input_view; auto predicate = [d_input] __device__(auto i) { return d_input.is_valid(i); }; - using Element = typename thrust:: - tuple_element<0, typename thrust::iterator_traits::value_type>::type; - - auto input_pair_iterator = cudf::detail::indexalator_factory::make_input_pair_iterator(input); + auto input_iterator = cudf::detail::indexalator_factory::make_input_optional_iterator(input); return cudf::detail::copy_if_else(true, - input_pair_iterator, - input_pair_iterator + input.size(), + input_iterator, + input_iterator + input.size(), replacement_iter, predicate, - data_type{type_to_id()}, + data_type{type_to_id()}, stream, mr); } @@ -100,7 +97,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, auto new_indices = replace_indices(input_indices, - cudf::detail::indexalator_factory::make_input_pair_iterator(repl_indices), + cudf::detail::indexalator_factory::make_input_optional_iterator(repl_indices), stream, mr); @@ -118,7 +115,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return cudf::empty_like(input.parent()); } - if (!input.has_nulls() || !replacement.is_valid()) { + if (!input.has_nulls() || !replacement.is_valid(stream)) { return std::make_unique(input.parent(), stream, mr); } CUDF_EXPECTS(input.keys().type() == replacement.type(), "keys must match scalar type"); @@ -133,7 +130,7 @@ std::unique_ptr replace_nulls(dictionary_column_view const& input, auto const input_indices = input_view.get_indices_annotated(); auto new_indices = replace_indices(input_indices, - cudf::detail::indexalator_factory::make_input_pair_iterator(*scalar_index), + cudf::detail::indexalator_factory::make_input_optional_iterator(*scalar_index), stream, mr); new_indices->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu index 83be6a0bac0..88e0de23290 100644 --- a/cpp/src/dictionary/search.cu +++ b/cpp/src/dictionary/search.cu @@ -69,7 +69,7 @@ struct find_index_fn { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - if (!key.is_valid()) + if (!key.is_valid(stream)) return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr); CUDF_EXPECTS(input.keys().type() == key.type(), "search key type must match dictionary keys type"); @@ -114,7 +114,7 @@ struct find_insert_index_fn { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const { - if (!key.is_valid()) + if (!key.is_valid(stream)) return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr); CUDF_EXPECTS(input.keys().type() == key.type(), "search key type must match dictionary keys type"); diff --git a/cpp/src/filling/calendrical_month_sequence.cu 
b/cpp/src/filling/calendrical_month_sequence.cu new file mode 100644 index 00000000000..159679c5ed2 --- /dev/null +++ b/cpp/src/filling/calendrical_month_sequence.cu @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +namespace cudf { +namespace detail { +std::unique_ptr calendrical_month_sequence(size_type size, + scalar const& init, + size_type months, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return type_dispatcher( + init.type(), calendrical_month_sequence_functor{}, size, init, months, stream, mr); +} +} // namespace detail + +std::unique_ptr calendrical_month_sequence(size_type size, + scalar const& init, + size_type months, + rmm::mr::device_memory_resource* mr) +{ + return detail::calendrical_month_sequence(size, init, months, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu index b62d2ed4f8f..749a4d7940c 100644 --- a/cpp/src/filling/fill.cu +++ b/cpp/src/filling/fill.cu @@ -74,9 +74,9 @@ struct in_place_fill_range_dispatch { cudf::size_type end, rmm::cuda_stream_view stream) { - auto unscaled = static_cast const&>(value).value(); + auto unscaled = static_cast const&>(value).value(stream); using RepType = typename T::rep; - auto s = cudf::numeric_scalar(unscaled, value.is_valid()); + auto s = cudf::numeric_scalar(unscaled, value.is_valid(stream)); auto view = cudf::bit_cast(destination, s.type()); in_place_fill(view, begin, end, s, stream); } @@ -110,7 +110,7 @@ struct out_of_place_fill_range_dispatch { auto p_ret = std::make_unique(input, stream, mr); if (end != begin) { // otherwise no fill - if (!p_ret->nullable() && !value.is_valid()) { + if (!p_ret->nullable() && !value.is_valid(stream)) { p_ret->set_null_mask( cudf::detail::create_null_mask(p_ret->size(), cudf::mask_state::ALL_VALID, stream, mr), 0); @@ -150,7 +150,7 @@ std::unique_ptr out_of_place_fill_range_dispatch::operator()(input, stream, mr); auto mview = result->mutable_view(); cudf::detail::set_null_mask(mview.null_mask(), begin, end, false, stream); @@ -210,7 +210,7 @@ void fill_in_place(mutable_column_view& destination, "In-place fill does not support variable-sized types."); CUDF_EXPECTS((begin >= 0) && (end <= destination.size()) && (begin <= end), "Range is out of bounds."); - CUDF_EXPECTS((destination.nullable() == true) || (value.is_valid() == true), + CUDF_EXPECTS((destination.nullable() == true) || (value.is_valid(stream) == true), "destination should be nullable or value should be non-null."); CUDF_EXPECTS(destination.type() == value.type(), "Data type mismatch."); diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 6a5e99026d3..af9fc300ed2 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -57,7 +57,7 @@ struct count_accessor { #else auto p_count = static_cast(this->p_scalar); #endif - auto count = p_count->value(); + 
auto count = p_count->value(stream); // static_cast is necessary due to bool CUDF_EXPECTS(static_cast(count) <= std::numeric_limits::max(), "count should not exceed size_type's limit."); diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index bdaccba38dc..e8b4a8b1cbf 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -33,7 +34,6 @@ #include #include #include -#include #include @@ -76,8 +76,8 @@ std::pair, std::vector> groupby::disp if (_keys_are_sorted == sorted::NO and not _helper and detail::hash::can_use_hash_groupby(_keys, requests)) { // Optionally flatten nested key columns. - auto [flattened_keys, _, __, ___] = - flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE); + auto flattened = flatten_nested_columns(_keys, {}, {}, column_nullability::FORCE); + auto flattened_keys = flattened.flattened_columns(); auto is_supported_key_type = [](auto col) { return cudf::is_equality_comparable(col.type()); }; CUDF_EXPECTS(std::all_of(flattened_keys.begin(), flattened_keys.end(), is_supported_key_type), "Unsupported groupby key type does not support equality comparison"); @@ -114,7 +114,7 @@ struct empty_column_constructor { if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) { return make_lists_column( - 0, make_empty_column(data_type{type_to_id()}), empty_like(values), 0, {}); + 0, make_empty_column(type_to_id()), empty_like(values), 0, {}); } // If `values` is LIST typed, and the aggregation results match the type, diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index b8150f7fd14..ef640256927 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -158,6 +158,17 @@ class groupby_simple_aggregations_collector final return aggs; } + + std::vector> visit( + data_type, cudf::detail::correlation_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } }; template @@ -621,11 +632,22 @@ std::unique_ptr
groupby_null_templated(table_view const& keys, */ bool can_use_hash_groupby(table_view const& keys, host_span requests) { - return std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) { - return std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) { - return is_hash_aggregation(a->kind); + auto const all_hash_aggregations = + std::all_of(requests.begin(), requests.end(), [](aggregation_request const& r) { + return std::all_of(r.aggregations.begin(), r.aggregations.end(), [](auto const& a) { + return is_hash_aggregation(a->kind); + }); }); - }); + + // Currently, structs are not supported in any of hash-based aggregations. + // Therefore, if any request contains structs then we must fall back to sort-based aggregations. + // TODO: Support structs in hash-based aggregations. + auto const has_struct = + std::any_of(requests.begin(), requests.end(), [](aggregation_request const& r) { + return r.values.type().id() == type_id::STRUCT; + }); + + return all_hash_aggregations && !has_struct; } // Hash-based groupby diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 1547964f3f4..83c6c1bca57 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -235,11 +236,14 @@ void aggregate_result_functor::operator()(aggregation const& // TODO (dm): Special case for timestamp. Add target_type_impl for it. // Blocked until we support operator+ on timestamps + auto col_type = cudf::is_dictionary(values.type()) + ? cudf::dictionary_column_view(values).keys().type() + : values.type(); auto result = cudf::detail::binary_operation(sum_result, count_result, binary_operator::DIV, - cudf::detail::target_type(values.type(), aggregation::MEAN), + cudf::detail::target_type(col_type, aggregation::MEAN), stream, mr); cache.add_result(values, agg, std::move(result)); @@ -525,6 +529,141 @@ void aggregate_result_functor::operator()(aggregation con get_grouped_values(), helper.group_offsets(stream), helper.num_groups(stream), stream, mr)); }; +/** + * @brief Creates column views with only valid elements in both input column views + * + * @param column_0 The first column + * @param column_1 The second column + * @return tuple with new null mask (if null masks of input differ) and new column views + */ +auto column_view_with_common_nulls(column_view const& column_0, column_view const& column_1) +{ + rmm::device_buffer new_nullmask = cudf::bitmask_and(table_view{{column_0, column_1}}); + auto null_count = cudf::count_unset_bits( + static_cast(new_nullmask.data()), 0, column_0.size()); + if (null_count == 0) { return std::make_tuple(std::move(new_nullmask), column_0, column_1); } + auto column_view_with_new_nullmask = [](auto const& col, void* nullmask, auto null_count) { + return column_view(col.type(), + col.size(), + col.head(), + static_cast(nullmask), + null_count, + col.offset(), + std::vector(col.child_begin(), col.child_end())); + }; + auto new_column_0 = null_count == column_0.null_count() + ? column_0 + : column_view_with_new_nullmask(column_0, new_nullmask.data(), null_count); + auto new_column_1 = null_count == column_1.null_count() + ? column_1 + : column_view_with_new_nullmask(column_1, new_nullmask.data(), null_count); + return std::make_tuple(std::move(new_nullmask), new_column_0, new_column_1); +} + +/** + * @brief Perform covariance between two child columns of non-nullable struct column.
+ * + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(values, agg)) { return; } + CUDF_EXPECTS(values.type().id() == type_id::STRUCT, + "Input to `groupby covariance` must be a structs column."); + CUDF_EXPECTS(values.num_children() == 2, + "Input to `groupby covariance` must be a structs column having 2 children columns."); + + auto const& cov_agg = dynamic_cast(agg); + // Covariance only for valid values in both columns. + // in non-identical null mask cases, this prevents caching of the results - STD, MEAN, COUNT. + auto [_, values_child0, values_child1] = + column_view_with_common_nulls(values.child(0), values.child(1)); + + auto mean_agg = make_mean_aggregation(); + aggregate_result_functor(values_child0, helper, cache, stream, mr).operator()(*mean_agg); + aggregate_result_functor(values_child1, helper, cache, stream, mr).operator()(*mean_agg); + + auto const mean0 = cache.get_result(values_child0, *mean_agg); + auto const mean1 = cache.get_result(values_child1, *mean_agg); + auto count_agg = make_count_aggregation(); + auto const count = cache.get_result(values_child0, *count_agg); + + cache.add_result(values, + agg, + detail::group_covariance(get_grouped_values().child(0), + get_grouped_values().child(1), + helper.group_labels(stream), + helper.num_groups(stream), + count, + mean0, + mean1, + cov_agg._min_periods, + cov_agg._ddof, + stream, + mr)); +}; + +/** + * @brief Perform correlation betweeen two child columns of non-nullable struct column. + * + */ +template <> +void aggregate_result_functor::operator()(aggregation const& agg) +{ + if (cache.has_result(values, agg)) { return; } + CUDF_EXPECTS(values.type().id() == type_id::STRUCT, + "Input to `groupby correlation` must be a structs column."); + CUDF_EXPECTS( + values.num_children() == 2, + "Input to `groupby correlation` must be a structs column having 2 children columns."); + CUDF_EXPECTS(values.nullable() == false, + "Input to `groupby correlation` must be a non-nullable structs column."); + + auto const& corr_agg = dynamic_cast(agg); + CUDF_EXPECTS(corr_agg._type == correlation_type::PEARSON, + "Only Pearson correlation is supported."); + + // Correlation only for valid values in both columns. 
+ // in non-identical null mask cases, this prevents caching of the results - STD, MEAN, COUNT + auto [_, values_child0, values_child1] = + column_view_with_common_nulls(values.child(0), values.child(1)); + + auto std_agg = make_std_aggregation(); + aggregate_result_functor(values_child0, helper, cache, stream, mr).operator()(*std_agg); + aggregate_result_functor(values_child1, helper, cache, stream, mr).operator()(*std_agg); + + // Compute covariance here to avoid repeated computation of mean & count + auto cov_agg = make_covariance_aggregation(corr_agg._min_periods); + if (not cache.has_result(values, *cov_agg)) { + auto mean_agg = make_mean_aggregation(); + auto const mean0 = cache.get_result(values_child0, *mean_agg); + auto const mean1 = cache.get_result(values_child1, *mean_agg); + auto count_agg = make_count_aggregation(); + auto const count = cache.get_result(values_child0, *count_agg); + + auto const& cov_agg_obj = dynamic_cast(*cov_agg); + cache.add_result(values, + *cov_agg, + detail::group_covariance(get_grouped_values().child(0), + get_grouped_values().child(1), + helper.group_labels(stream), + helper.num_groups(stream), + count, + mean0, + mean1, + cov_agg_obj._min_periods, + cov_agg_obj._ddof, + stream, + mr)); + } + + auto const stddev0 = cache.get_result(values_child0, *std_agg); + auto const stddev1 = cache.get_result(values_child1, *std_agg); + auto const covariance = cache.get_result(values, *cov_agg); + cache.add_result( + values, agg, detail::group_correlation(covariance, stddev0, stddev1, stream, mr)); +} + /** * @brief Generate a tdigest column from a grouped set of numeric input values. * diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu index 6ce23ffc35b..466171ec80b 100644 --- a/cpp/src/groupby/sort/group_argmax.cu +++ b/cpp/src/groupby/sort/group_argmax.cu @@ -34,7 +34,7 @@ std::unique_ptr group_argmax(column_view const& values, rmm::mr::device_memory_resource* mr) { auto indices = type_dispatcher(values.type(), - reduce_functor{}, + group_reduction_dispatcher{}, values, num_groups, group_labels, diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu index ab91c2c0d29..4f7b2b713e6 100644 --- a/cpp/src/groupby/sort/group_argmin.cu +++ b/cpp/src/groupby/sort/group_argmin.cu @@ -34,7 +34,7 @@ std::unique_ptr group_argmin(column_view const& values, rmm::mr::device_memory_resource* mr) { auto indices = type_dispatcher(values.type(), - reduce_functor{}, + group_reduction_dispatcher{}, values, num_groups, group_labels, diff --git a/cpp/src/groupby/sort/group_correlation.cu b/cpp/src/groupby/sort/group_correlation.cu new file mode 100644 index 00000000000..cdcf4311be7 --- /dev/null +++ b/cpp/src/groupby/sort/group_correlation.cu @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace groupby { +namespace detail { +namespace { + +template +constexpr bool is_double_convertible() +{ + return std::is_convertible_v || std::is_constructible_v; +} + +struct is_double_convertible_impl { + template + bool operator()() + { + return is_double_convertible(); + } +}; + +/** + * @brief Typecasts each element of the column to `CastType` + */ +template +struct type_casted_accessor { + template + CUDA_DEVICE_CALLABLE CastType operator()(cudf::size_type i, column_device_view const& col) const + { + if constexpr (column_device_view::has_element_accessor() and + std::is_convertible_v) + return static_cast(col.element(i)); + (void)i; + (void)col; + return {}; + } +}; + +template +struct covariance_transform { + column_device_view const d_values_0, d_values_1; + ResultType const *d_means_0, *d_means_1; + size_type const* d_group_sizes; + size_type const* d_group_labels; + size_type ddof{1}; // TODO update based on bias. + + __device__ static ResultType value(column_device_view const& view, size_type i) + { + bool const is_dict = view.type().id() == type_id::DICTIONARY32; + i = is_dict ? static_cast(view.element(i)) : i; + auto values_col = is_dict ? view.child(dictionary_column_view::keys_column_index) : view; + return type_dispatcher(values_col.type(), type_casted_accessor{}, i, values_col); + } + + __device__ ResultType operator()(size_type i) + { + if (d_values_0.is_null(i) or d_values_1.is_null(i)) return 0.0; + + // This has to be device dispatch because x and y type may differ + auto const x = value(d_values_0, i); + auto const y = value(d_values_1, i); + + size_type const group_idx = d_group_labels[i]; + size_type const group_size = d_group_sizes[group_idx]; + + // prevent divide by zero error + if (group_size == 0 or group_size - ddof <= 0) return 0.0; + + ResultType const xmean = d_means_0[group_idx]; + ResultType const ymean = d_means_1[group_idx]; + return (x - xmean) * (y - ymean) / (group_size - ddof); + } +}; +} // namespace + +std::unique_ptr group_covariance(column_view const& values_0, + column_view const& values_1, + cudf::device_span group_labels, + size_type num_groups, + column_view const& count, + column_view const& mean_0, + column_view const& mean_1, + size_type min_periods, + size_type ddof, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using result_type = id_to_type; + static_assert( + std::is_same_v, + result_type>); + + // check if each child type can be converted to float64. + auto get_base_type = [](auto const& col) { + return (col.type().id() == type_id::DICTIONARY32 + ? 
col.child(dictionary_column_view::keys_column_index) + : col) + .type(); + }; + bool const is_convertible = + type_dispatcher(get_base_type(values_0), is_double_convertible_impl{}) or + type_dispatcher(get_base_type(values_1), is_double_convertible_impl{}); + + CUDF_EXPECTS(is_convertible, + "Input to `group_correlation` must be columns of type convertible to float64."); + + auto mean0_ptr = mean_0.begin(); + auto mean1_ptr = mean_1.begin(); + + auto d_values_0 = column_device_view::create(values_0, stream); + auto d_values_1 = column_device_view::create(values_1, stream); + covariance_transform covariance_transform_op{*d_values_0, + *d_values_1, + mean0_ptr, + mean1_ptr, + count.data(), + group_labels.begin(), + ddof}; + + auto result = make_numeric_column( + data_type(type_to_id()), num_groups, mask_state::UNALLOCATED, stream, mr); + auto d_result = result->mutable_view().begin(); + + auto corr_iter = + thrust::make_transform_iterator(thrust::make_counting_iterator(0), covariance_transform_op); + + thrust::reduce_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + corr_iter, + thrust::make_discard_iterator(), + d_result); + + auto is_null = [ddof, min_periods] __device__(size_type group_size) { + return not(group_size == 0 or group_size - ddof <= 0 or group_size < min_periods); + }; + auto [new_nullmask, null_count] = + cudf::detail::valid_if(count.begin(), count.end(), is_null, stream, mr); + if (null_count != 0) { + result->set_null_mask(std::move(new_nullmask)); + result->set_null_count(null_count); + } + return result; +} + +std::unique_ptr group_correlation(column_view const& covariance, + column_view const& stddev_0, + column_view const& stddev_1, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + using result_type = id_to_type; + CUDF_EXPECTS(covariance.type().id() == type_id::FLOAT64, + "Covariance result as FLOAT64 is supported"); + auto stddev0_ptr = stddev_0.begin(); + auto stddev1_ptr = stddev_1.begin(); + auto stddev_iter = thrust::make_zip_iterator(thrust::make_tuple(stddev0_ptr, stddev1_ptr)); + auto result = make_numeric_column(covariance.type(), + covariance.size(), + cudf::detail::copy_bitmask(covariance, stream, mr), + covariance.null_count(), + stream, + mr); + auto d_result = result->mutable_view().begin(); + thrust::transform(rmm::exec_policy(stream), + covariance.begin(), + covariance.end(), + stddev_iter, + d_result, + [] __device__(auto const covariance, auto const stddev) { + return covariance / thrust::get<0>(stddev) / thrust::get<1>(stddev); + }); + return result; +} + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/group_max.cu b/cpp/src/groupby/sort/group_max.cu index 7dd0e43ad28..5da15266233 100644 --- a/cpp/src/groupby/sort/group_max.cu +++ b/cpp/src/groupby/sort/group_max.cu @@ -30,8 +30,13 @@ std::unique_ptr group_max(column_view const& values, auto values_type = cudf::is_dictionary(values.type()) ? 
dictionary_column_view(values).keys().type() : values.type(); - return type_dispatcher( - values_type, reduce_functor{}, values, num_groups, group_labels, stream, mr); + return type_dispatcher(values_type, + group_reduction_dispatcher{}, + values, + num_groups, + group_labels, + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_max_scan.cu b/cpp/src/groupby/sort/group_max_scan.cu index 303d606be9d..1551dc00a04 100644 --- a/cpp/src/groupby/sort/group_max_scan.cu +++ b/cpp/src/groupby/sort/group_max_scan.cu @@ -27,8 +27,13 @@ std::unique_ptr max_scan(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return type_dispatcher( - values.type(), scan_functor{}, values, num_groups, group_labels, stream, mr); + return type_dispatcher(values.type(), + group_scan_dispatcher{}, + values, + num_groups, + group_labels, + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_min.cu b/cpp/src/groupby/sort/group_min.cu index 4124ec0f6f6..c42a0b94de0 100644 --- a/cpp/src/groupby/sort/group_min.cu +++ b/cpp/src/groupby/sort/group_min.cu @@ -30,8 +30,13 @@ std::unique_ptr group_min(column_view const& values, auto values_type = cudf::is_dictionary(values.type()) ? dictionary_column_view(values).keys().type() : values.type(); - return type_dispatcher( - values_type, reduce_functor{}, values, num_groups, group_labels, stream, mr); + return type_dispatcher(values_type, + group_reduction_dispatcher{}, + values, + num_groups, + group_labels, + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_min_scan.cu b/cpp/src/groupby/sort/group_min_scan.cu index 4a692cdf0bd..daaeb6bb6f7 100644 --- a/cpp/src/groupby/sort/group_min_scan.cu +++ b/cpp/src/groupby/sort/group_min_scan.cu @@ -27,8 +27,13 @@ std::unique_ptr min_scan(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return type_dispatcher( - values.type(), scan_functor{}, values, num_groups, group_labels, stream, mr); + return type_dispatcher(values.type(), + group_scan_dispatcher{}, + values, + num_groups, + group_labels, + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_product.cu b/cpp/src/groupby/sort/group_product.cu index e9cf8611b58..74f5cbed041 100644 --- a/cpp/src/groupby/sort/group_product.cu +++ b/cpp/src/groupby/sort/group_product.cu @@ -33,7 +33,7 @@ std::unique_ptr group_product(column_view const& values, ? 
dictionary_column_view(values).keys().type() : values.type(); return type_dispatcher(values_type, - reduce_functor{}, + group_reduction_dispatcher{}, values, num_groups, group_labels, diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 5f4dda294fd..935ef9554a9 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -23,8 +24,6 @@ #include #include -#include - namespace cudf { namespace groupby { namespace detail { @@ -53,14 +52,12 @@ std::unique_ptr rank_generator(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const superimposed = structs::detail::superimpose_parent_nulls(order_by, stream, mr); - table_view const order_table{{std::get<0>(superimposed)}}; - auto const flattener = cudf::structs::detail::flatten_nested_columns( - order_table, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); - auto const d_flat_order = table_device_view::create(std::get<0>(flattener), stream); + auto const flattened = cudf::structs::detail::flatten_nested_columns( + table_view{{order_by}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); + auto const d_flat_order = table_device_view::create(flattened, stream); row_equality_comparator comparator(*d_flat_order, *d_flat_order, true); auto ranks = make_fixed_width_column(data_type{type_to_id()}, - order_table.num_rows(), + flattened.flattened_columns().num_rows(), mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp index 725ff8ef8b8..75708c7b01c 100644 --- a/cpp/src/groupby/sort/group_reductions.hpp +++ b/cpp/src/groupby/sort/group_reductions.hpp @@ -441,6 +441,47 @@ std::unique_ptr group_merge_m2(column_view const& values, size_type num_groups, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @brief Internal API to find covariance of child columns of a non-nullable struct column. + * + * @param values_0 The first grouped values column to compute covariance + * @param values_1 The second grouped values column to compute covariance + * @param group_labels ID of group that the corresponding value belongs to + * @param num_groups Number of groups. + * @param count The count of valid rows of the grouped values of both columns + * @param mean_0 The mean of the first grouped values column + * @param mean_1 The mean of the second grouped values column + * @param min_periods The minimum number of non-null rows required to consider the covariance + * @param ddof The delta degrees of freedom used in the calculation of the variance + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr group_covariance(column_view const& values_0, + column_view const& values_1, + cudf::device_span group_labels, + size_type num_groups, + column_view const& count, + column_view const& mean_0, + column_view const& mean_1, + size_type min_periods, + size_type ddof, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to find correlation from covariance and standard deviation. 
+ * + * @param covariance The covariance of two grouped values columns + * @param stddev_0 The standard deviation of the first grouped values column + * @param stddev_1 The standard deviation of the second grouped values column + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory + */ +std::unique_ptr group_correlation(column_view const& covariance, + column_view const& stddev_0, + column_view const& stddev_1, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Generate a tdigest column from a grouped set of numeric input values. diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index 53d05b0c48b..013ea924cce 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -16,17 +16,24 @@ #pragma once +#include + #include #include #include +#include #include +#include #include #include +#include +#include #include #include #include #include +#include #include #include @@ -35,26 +42,51 @@ namespace cudf { namespace groupby { namespace detail { +// Error case when no other overload or specialization is available +template +struct group_scan_functor { + template + static std::unique_ptr invoke(Args&&...) + { + CUDF_FAIL("Unsupported groupby scan type-agg combination."); + } +}; + template -struct scan_functor { +struct group_scan_dispatcher { template - static constexpr bool is_supported() + std::unique_ptr operator()(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - if (K == aggregation::SUM) - return cudf::is_numeric() || cudf::is_duration() || cudf::is_fixed_point(); - else if (K == aggregation::MIN or K == aggregation::MAX) - return cudf::is_fixed_width() and is_relationally_comparable(); - else - return false; + return group_scan_functor::invoke(values, num_groups, group_labels, stream, mr); } +}; - template - std::enable_if_t(), std::unique_ptr> operator()( - column_view const& values, - size_type num_groups, - cudf::device_span group_labels, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +/** + * @brief Check if the given aggregation K with data type T is supported in groupby scan. + */ +template +static constexpr bool is_group_scan_supported() +{ + if (K == aggregation::SUM) + return cudf::is_numeric() || cudf::is_duration() || cudf::is_fixed_point(); + else if (K == aggregation::MIN or K == aggregation::MAX) + return not cudf::is_dictionary() and + (is_relationally_comparable() or std::is_same_v); + else + return false; +} + +template +struct group_scan_functor()>> { + static std::unique_ptr invoke(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using DeviceType = device_storage_type_t; using OpType = cudf::detail::corresponding_operator_t; @@ -76,36 +108,159 @@ struct scan_functor { auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); auto values_view = column_device_view::create(values, stream); - if (values.has_nulls()) { - auto input = thrust::make_transform_iterator( - make_null_replacement_iterator(*values_view, OpType::template identity()), - thrust::identity{}); + // Perform segmented scan. 
+ auto const do_scan = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) { thrust::inclusive_scan_by_key(rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), - input, - result_view->begin(), + inp_iter, + out_iter, thrust::equal_to{}, - OpType{}); + binop); + }; + + if (values.has_nulls()) { + auto input = thrust::make_transform_iterator( + make_null_replacement_iterator(*values_view, OpType::template identity()), + thrust::identity{}); + do_scan(input, result_view->begin(), OpType{}); result->set_null_mask(cudf::detail::copy_bitmask(values, stream)); } else { auto input = thrust::make_transform_iterator(values_view->begin(), thrust::identity{}); + do_scan(input, result_view->begin(), OpType{}); + } + return result; + } +}; + +template +struct group_scan_functor()>> { + static std::unique_ptr invoke(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + using OpType = cudf::detail::corresponding_operator_t; + + if (values.is_empty()) { return cudf::make_empty_column(cudf::type_id::STRING); } + + // create an empty output vector we can fill with string_view instances + auto results_vector = rmm::device_uvector(values.size(), stream); + + auto values_view = column_device_view::create(values, stream); + + // Perform segmented scan. + auto const do_scan = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) { thrust::inclusive_scan_by_key(rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), - input, - result_view->begin(), + inp_iter, + out_iter, thrust::equal_to{}, - OpType{}); + binop); + }; + + if (values.has_nulls()) { + auto input = make_null_replacement_iterator( + *values_view, OpType::template identity(), values.has_nulls()); + do_scan(input, results_vector.begin(), OpType{}); + } else { + do_scan(values_view->begin(), results_vector.begin(), OpType{}); } - return result; + + // turn the string_view vector into a strings column + auto results = make_strings_column(results_vector, string_view{}, stream, mr); + if (values.has_nulls()) + results->set_null_mask(cudf::detail::copy_bitmask(values, stream), values.null_count()); + return results; } +}; - template - std::enable_if_t(), std::unique_ptr> operator()(Args&&... args) +template +struct group_scan_functor()>> { + static std::unique_ptr invoke(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - CUDF_FAIL("Unsupported groupby scan type-agg combination"); + if (values.is_empty()) { return cudf::empty_like(values); } + + // When finding MIN, we need to consider nulls as larger than non-null elements. + // Thing is opposite when finding MAX. + auto const null_precedence = (K == aggregation::MIN) ? null_order::AFTER : null_order::BEFORE; + auto const flattened_values = structs::detail::flatten_nested_columns( + table_view{{values}}, {}, std::vector{null_precedence}); + auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream); + auto const flattened_null_precedences = + (K == aggregation::MIN) + ? cudf::detail::make_device_uvector_async(flattened_values.null_orders(), stream) + : rmm::device_uvector(0, stream); + + // Create a gather map contaning indices of the prefix min/max elements. + auto gather_map = rmm::device_uvector(values.size(), stream); + auto const map_begin = gather_map.begin(); + + // Perform segmented scan. 
+ auto const do_scan = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) { + thrust::inclusive_scan_by_key(rmm::exec_policy(stream), + group_labels.begin(), + group_labels.end(), + inp_iter, + out_iter, + thrust::equal_to{}, + binop); + }; + + // Find the indices of the prefix min/max elements within each group. + auto const count_iter = thrust::make_counting_iterator(0); + if (values.has_nulls()) { + auto const binop = row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::MIN); + do_scan(count_iter, map_begin, binop); + } else { + auto const binop = row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::MIN); + do_scan(count_iter, map_begin, binop); + } + + auto gather_map_view = + column_view(data_type{type_to_id()}, gather_map.size(), gather_map.data()); + + // Gather the children elements of the prefix min/max struct elements first. + auto scanned_children = + cudf::detail::gather( + table_view(std::vector{values.child_begin(), values.child_end()}), + gather_map_view, + cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr) + ->release(); + + // After gathering the children elements, we need to push down nulls from the root structs + // column to them. + if (values.has_nulls()) { + for (std::unique_ptr& child : scanned_children) { + structs::detail::superimpose_parent_nulls( + values.null_mask(), values.null_count(), *child, stream, mr); + } + } + + return make_structs_column(values.size(), + std::move(scanned_children), + values.null_count(), + cudf::detail::copy_bitmask(values, stream, mr)); } }; diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index db2ae5b5d8e..4e0820af236 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -16,12 +16,17 @@ #pragma once +#include + #include #include #include #include #include +#include +#include #include +#include #include #include @@ -38,40 +43,33 @@ namespace groupby { namespace detail { /** - * @brief ArgMin binary operator with index values into input column. + * @brief Binary operator with index values into the input column. * * @tparam T Type of the underlying column. Must support '<' operator. */ template -struct ArgMin { +struct element_arg_minmax_fn { column_device_view const d_col; - CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const - { - // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and - // github.com/NVIDIA/thrust/issues/1525 - // where invalid random values may be passed here by thrust::reduce_by_key - if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; } - if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; } - return d_col.element(lhs) < d_col.element(rhs) ? lhs : rhs; - } -}; + bool const has_nulls; + bool const arg_min; -/** - * @brief ArgMax binary operator with index values into input column. - * - * @tparam T Type of the underlying column. Must support '<' operator. 
- */ -template -struct ArgMax { - column_device_view const d_col; - CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs, size_type const& rhs) const + CUDA_DEVICE_CALLABLE auto operator()(size_type const& lhs_idx, size_type const& rhs_idx) const { // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and // github.com/NVIDIA/thrust/issues/1525 // where invalid random values may be passed here by thrust::reduce_by_key - if (lhs < 0 || lhs >= d_col.size() || d_col.is_null(lhs)) { return rhs; } - if (rhs < 0 || rhs >= d_col.size() || d_col.is_null(rhs)) { return lhs; } - return d_col.element(rhs) < d_col.element(lhs) ? lhs : rhs; + if (lhs_idx < 0 || lhs_idx >= d_col.size() || (has_nulls && d_col.is_null_nocheck(lhs_idx))) { + return rhs_idx; + } + if (rhs_idx < 0 || rhs_idx >= d_col.size() || (has_nulls && d_col.is_null_nocheck(rhs_idx))) { + return lhs_idx; + } + + // Return `lhs_idx` iff: + // row(lhs_idx) < row(rhs_idx) and finding ArgMin, or + // row(lhs_idx) >= row(rhs_idx) and finding ArgMax. + auto const less = d_col.element(lhs_idx) < d_col.element(rhs_idx); + return less == arg_min ? lhs_idx : rhs_idx; } }; @@ -121,34 +119,58 @@ struct null_replaced_value_accessor : value_accessor { } }; +// Error case when no other overload or specialization is available +template +struct group_reduction_functor { + template + static std::unique_ptr invoke(Args&&...) + { + CUDF_FAIL("Unsupported groupby reduction type-agg combination."); + } +}; + template -struct reduce_functor { +struct group_reduction_dispatcher { template - static constexpr bool is_supported() + std::unique_ptr operator()(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - switch (K) { - case aggregation::SUM: - return cudf::is_numeric() || cudf::is_duration() || cudf::is_fixed_point(); - case aggregation::PRODUCT: return cudf::detail::is_product_supported(); - case aggregation::MIN: - case aggregation::MAX: - return cudf::is_fixed_width() and is_relationally_comparable(); - case aggregation::ARGMIN: - case aggregation::ARGMAX: return is_relationally_comparable(); - default: return false; - } + return group_reduction_functor::invoke(values, num_groups, group_labels, stream, mr); } +}; + +/** + * @brief Check if the given aggregation K with data type T is supported in groupby reduction. 
+ */ +template +static constexpr bool is_group_reduction_supported() +{ + switch (K) { + case aggregation::SUM: + return cudf::is_numeric() || cudf::is_duration() || cudf::is_fixed_point(); + case aggregation::PRODUCT: return cudf::detail::is_product_supported(); + case aggregation::MIN: + case aggregation::MAX: return cudf::is_fixed_width() and is_relationally_comparable(); + case aggregation::ARGMIN: + case aggregation::ARGMAX: + return is_relationally_comparable() or std::is_same_v; + default: return false; + } +} + +template +struct group_reduction_functor()>> { + static std::unique_ptr invoke(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) - template - std::enable_if_t(), std::unique_ptr> operator()( - column_view const& values, - size_type num_groups, - cudf::device_span group_labels, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) { using DeviceType = device_storage_type_t; - using OpType = cudf::detail::corresponding_operator_t; using ResultType = cudf::detail::target_type_t; using ResultDType = device_storage_type_t; @@ -161,55 +183,119 @@ struct reduce_functor { if (values.is_empty()) { return result; } - auto resultview = mutable_column_device_view::create(result->mutable_view(), stream); - auto valuesview = column_device_view::create(values, stream); - if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) { - using OpType = - std::conditional_t<(K == aggregation::ARGMAX), ArgMax, ArgMin>; + // Perform segmented reduction. + auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) { thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.data(), group_labels.data() + group_labels.size(), - thrust::make_counting_iterator(0), + inp_iter, thrust::make_discard_iterator(), - resultview->begin(), - thrust::equal_to{}, - OpType{*valuesview}); - } else { - auto init = OpType::template identity(); - auto begin = cudf::detail::make_counting_transform_iterator( - 0, null_replaced_value_accessor{*valuesview, init, values.has_nulls()}); - thrust::reduce_by_key(rmm::exec_policy(stream), - group_labels.data(), - group_labels.data() + group_labels.size(), - begin, - thrust::make_discard_iterator(), - resultview->begin(), + out_iter, thrust::equal_to{}, - OpType{}); + binop); + }; + + auto const d_values_ptr = column_device_view::create(values, stream); + auto const result_begin = result->mutable_view().template begin(); + + if constexpr (K == aggregation::ARGMAX || K == aggregation::ARGMIN) { + auto const count_iter = thrust::make_counting_iterator(0); + auto const binop = + element_arg_minmax_fn{*d_values_ptr, values.has_nulls(), K == aggregation::ARGMIN}; + do_reduction(count_iter, result_begin, binop); + } else { + using OpType = cudf::detail::corresponding_operator_t; + auto init = OpType::template identity(); + auto inp_values = cudf::detail::make_counting_transform_iterator( + 0, null_replaced_value_accessor{*d_values_ptr, init, values.has_nulls()}); + do_reduction(inp_values, result_begin, OpType{}); } if (values.has_nulls()) { rmm::device_uvector validity(num_groups, stream); + do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), + validity.begin(), + thrust::logical_or{}); + + auto [null_mask, null_count] = cudf::detail::valid_if( + validity.begin(), validity.end(), thrust::identity{}, stream, mr); + result->set_null_mask(std::move(null_mask), null_count); + } + return result; + } +}; + 
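Aside for reviewers (illustrative only, not part of this patch): the `do_reduction` lambda above is a thin wrapper around `thrust::reduce_by_key` keyed on the sorted group labels, reused both for the value reduction and for the validity reduction. A minimal, self-contained sketch of the same pattern on made-up toy data (the names `segmented_sum_sketch`, `h_labels`, and `h_values` are invented for this example) would be:

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/reduce.h>

#include <vector>

// Segmented sum over pre-sorted group labels, mirroring the reduce_by_key call
// wrapped by do_reduction: values 1,2,3 belong to group 0; 10,20 to group 1; 5 to group 2.
void segmented_sum_sketch()
{
  std::vector<int> const h_labels{0, 0, 0, 1, 1, 2};
  std::vector<double> const h_values{1., 2., 3., 10., 20., 5.};
  thrust::device_vector<int> group_labels(h_labels.begin(), h_labels.end());
  thrust::device_vector<double> values(h_values.begin(), h_values.end());
  thrust::device_vector<double> sums(3);  // one output slot per group

  thrust::reduce_by_key(group_labels.begin(),
                        group_labels.end(),
                        values.begin(),
                        thrust::make_discard_iterator(),  // group keys are implied by output position
                        sums.begin(),
                        thrust::equal_to<int>{},
                        thrust::plus<double>{});
  // sums now holds {6.0, 30.0, 5.0}: one reduced value per group, analogous to the
  // per-group result column produced by group_reduction_functor.
}
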
+template +struct group_reduction_functor< + K, + cudf::struct_view, + std::enable_if_t()>> { + static std::unique_ptr invoke(column_view const& values, + size_type num_groups, + cudf::device_span group_labels, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + // This is be expected to be size_type. + using ResultType = cudf::detail::target_type_t; + + auto result = make_fixed_width_column( + data_type{type_to_id()}, num_groups, mask_state::UNALLOCATED, stream, mr); + + if (values.is_empty()) { return result; } + + // When finding ARGMIN, we need to consider nulls as larger than non-null elements. + // Thing is opposite for ARGMAX. + auto const null_precedence = + (K == aggregation::ARGMIN) ? null_order::AFTER : null_order::BEFORE; + auto const flattened_values = structs::detail::flatten_nested_columns( + table_view{{values}}, {}, std::vector{null_precedence}); + auto const d_flattened_values_ptr = table_device_view::create(flattened_values, stream); + auto const flattened_null_precedences = + (K == aggregation::ARGMIN) + ? cudf::detail::make_device_uvector_async(flattened_values.null_orders(), stream) + : rmm::device_uvector(0, stream); + + // Perform segmented reduction to find ARGMIN/ARGMAX. + auto const do_reduction = [&](auto const& inp_iter, auto const& out_iter, auto const& binop) { thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.data(), group_labels.data() + group_labels.size(), - cudf::detail::make_validity_iterator(*valuesview), + inp_iter, thrust::make_discard_iterator(), - validity.begin(), + out_iter, thrust::equal_to{}, - thrust::logical_or{}); + binop); + }; + + auto const count_iter = thrust::make_counting_iterator(0); + auto const result_begin = result->mutable_view().template begin(); + if (values.has_nulls()) { + auto const binop = row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::ARGMIN); + do_reduction(count_iter, result_begin, binop); + + // Generate bitmask for the output by segmented reduction of the input bitmask. + auto const d_values_ptr = column_device_view::create(values, stream); + auto validity = rmm::device_uvector(num_groups, stream); + do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), + validity.begin(), + thrust::logical_or{}); + auto [null_mask, null_count] = cudf::detail::valid_if( validity.begin(), validity.end(), thrust::identity{}, stream, mr); - result->set_null_mask(std::move(null_mask)); - result->set_null_count(null_count); + result->set_null_mask(std::move(null_mask), null_count); + } else { + auto const binop = row_arg_minmax_fn(values.size(), + *d_flattened_values_ptr, + flattened_null_precedences.data(), + K == aggregation::ARGMIN); + do_reduction(count_iter, result_begin, binop); } - return result; - } - template - std::enable_if_t(), std::unique_ptr> operator()(Args&&... args) - { - CUDF_FAIL("Unsupported type-agg combination"); + return result; } }; diff --git a/cpp/src/groupby/sort/group_sum.cu b/cpp/src/groupby/sort/group_sum.cu index e9e6e985c54..e3c2ce7c864 100644 --- a/cpp/src/groupby/sort/group_sum.cu +++ b/cpp/src/groupby/sort/group_sum.cu @@ -32,8 +32,13 @@ std::unique_ptr group_sum(column_view const& values, auto values_type = cudf::is_dictionary(values.type()) ? 
dictionary_column_view(values).keys().type() : values.type(); - return type_dispatcher( - values_type, reduce_functor{}, values, num_groups, group_labels, stream, mr); + return type_dispatcher(values_type, + group_reduction_dispatcher{}, + values, + num_groups, + group_labels, + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_sum_scan.cu b/cpp/src/groupby/sort/group_sum_scan.cu index ae9b1c321d4..632fde3b9d5 100644 --- a/cpp/src/groupby/sort/group_sum_scan.cu +++ b/cpp/src/groupby/sort/group_sum_scan.cu @@ -27,8 +27,13 @@ std::unique_ptr sum_scan(column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return type_dispatcher( - values.type(), scan_functor{}, values, num_groups, group_labels, stream, mr); + return type_dispatcher(values.type(), + group_scan_dispatcher{}, + values, + num_groups, + group_labels, + stream, + mr); } } // namespace detail diff --git a/cpp/src/groupby/sort/group_tdigest.cu b/cpp/src/groupby/sort/group_tdigest.cu index 0738e4c5730..146a6a8c31c 100644 --- a/cpp/src/groupby/sort/group_tdigest.cu +++ b/cpp/src/groupby/sort/group_tdigest.cu @@ -25,10 +25,10 @@ #include #include #include +#include #include #include -#include #include #include @@ -40,6 +40,8 @@ namespace cudf { namespace groupby { namespace detail { +using namespace cudf::tdigest; + namespace { // the most representative point within a cluster of similar @@ -55,7 +57,10 @@ struct make_centroid { centroid operator() __device__(size_type index) { - return {static_cast(col.element(index)), 1, col.is_valid(index)}; + auto const is_valid = col.is_valid(index); + auto const mean = is_valid ? static_cast(col.element(index)) : 0.0; + auto const weight = is_valid ? 1.0 : 0.0; + return {mean, weight, is_valid}; } }; @@ -119,6 +124,9 @@ struct nearest_value_centroid_weights { auto const tdigest_begin = outer_offsets[group_index]; auto const tdigest_end = outer_offsets[group_index + 1]; auto const num_weights = inner_offsets[tdigest_end] - inner_offsets[tdigest_begin]; + // NOTE: as it is today, this functor will never be called for any digests that are empty, but + // I'll leave this check here for safety. + if (num_weights == 0) { return thrust::pair{0, 0}; } double const* group_cumulative_weights = cumulative_weights + inner_offsets[tdigest_begin]; auto const index = ((thrust::lower_bound(thrust::seq, @@ -179,6 +187,24 @@ struct cumulative_centroid_weight { } }; +struct tdigest_min { + __device__ double operator()(thrust::tuple const& t) + { + auto const min = thrust::get<0>(t); + auto const size = thrust::get<1>(t); + return size > 0 ? min : std::numeric_limits::max(); + } +}; + +struct tdigest_max { + __device__ double operator()(thrust::tuple const& t) + { + auto const max = thrust::get<0>(t); + auto const size = thrust::get<1>(t); + return size > 0 ? max : std::numeric_limits::lowest(); + } +}; + // a monotonically increasing scale function which produces a distribution // of centroids that is more densely packed in the middle of the input // than at the ends. @@ -214,6 +240,7 @@ __device__ double scale_func_k1(double quantile, double delta_norm) * @param group_cluster_wl Output. The set of cluster weight limits for each group. * @param group_num_clusters Output. The number of output clusters for each input group. 
* @param group_cluster_offsets Offsets per-group to the start of it's clusters + * @param has_nulls Whether or not the input contains nulls * */ template @@ -224,24 +251,33 @@ __global__ void generate_cluster_limits_kernel(int delta_, CumulativeWeight cumulative_weight, double* group_cluster_wl, size_type* group_num_clusters, - offset_type const* group_cluster_offsets) + offset_type const* group_cluster_offsets, + bool has_nulls) { int const tid = threadIdx.x + blockIdx.x * blockDim.x; auto const group_index = tid; if (group_index >= num_groups) { return; } // we will generate at most delta clusters. - double const delta = static_cast(delta_); - double const delta_norm = delta / (2.0 * M_PI); - double const total_weight = total_weight_[group_index]; - group_num_clusters[group_index] = 0; - // a group with nothing in it. - if (total_weight <= 0) { return; } + double const delta = static_cast(delta_); + double const delta_norm = delta / (2.0 * M_PI); + double const total_weight = total_weight_[group_index]; // start at the correct place based on our cluster offset. double* cluster_wl = group_cluster_wl ? group_cluster_wl + group_cluster_offsets[group_index] : nullptr; + // a group with nothing in it. + group_num_clusters[group_index] = 0; + if (total_weight <= 0) { + // if the input contains nulls we can potentially have a group that generates no + // clusters because -all- of the input values are null. in that case, the reduce_by_key call + // in the tdigest generation step will need a location to store the unused reduction value for + // that group of nulls. these "stubs" will be postprocessed out afterwards. + if (has_nulls) { group_num_clusters[group_index] = 1; } + return; + } + double cur_limit = 0.0; double cur_weight = 0.0; double next_limit = -1.0; @@ -326,6 +362,7 @@ __global__ void generate_cluster_limits_kernel(int delta_, * stream that falls before our current cluster limit * @param total_weight A functor which returns the expected total weight for * the entire stream of input values for the specified group. + * @param has_nulls Whether or not the input data contains nulls * @param stream CUDA stream used for device memory operations and kernel launches. 
* @param mr Device memory resource used to allocate the returned column's device memory * @@ -339,6 +376,7 @@ generate_group_cluster_info(int delta, NearestWeight nearest_weight, TotalWeightIter total_weight, CumulativeWeight cumulative_weight, + bool has_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -356,10 +394,11 @@ generate_group_cluster_info(int delta, cumulative_weight, nullptr, group_num_clusters.begin(), - nullptr); + nullptr, + has_nulls); // generate group cluster offsets (where the clusters for a given group start and end) - auto group_cluster_offsets = cudf::make_fixed_width_column( + auto group_cluster_offsets = cudf::make_numeric_column( data_type{type_id::INT32}, num_groups + 1, mask_state::UNALLOCATED, stream, mr); auto cluster_size = cudf::detail::make_counting_transform_iterator( 0, [group_num_clusters = group_num_clusters.begin(), num_groups] __device__(size_type index) { @@ -385,13 +424,96 @@ generate_group_cluster_info(int delta, cumulative_weight, group_cluster_wl.begin(), group_num_clusters.begin(), - group_cluster_offsets->view().begin()); + group_cluster_offsets->view().begin(), + has_nulls); return {std::move(group_cluster_wl), std::move(group_cluster_offsets), static_cast(total_clusters)}; } +std::unique_ptr build_output_column(size_type num_rows, + std::unique_ptr&& means, + std::unique_ptr&& weights, + std::unique_ptr&& offsets, + std::unique_ptr&& min_col, + std::unique_ptr&& max_col, + bool has_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // whether or not this weight is a stub + auto is_stub_weight = [weights = weights->view().begin()] __device__(size_type i) { + return weights[i] == 0; + }; + // whether or not this particular tdigest is a stub + auto is_stub_digest = [offsets = offsets->view().begin(), is_stub_weight] __device__( + size_type i) { return is_stub_weight(offsets[i]) ? 1 : 0; }; + + size_type const num_stubs = [&]() { + if (!has_nulls) { return 0; } + auto iter = cudf::detail::make_counting_transform_iterator(0, is_stub_digest); + return thrust::reduce(rmm::exec_policy(stream), iter, iter + num_rows); + }(); + + // if there are no stub tdigests, we can return immediately. + if (num_stubs == 0) { + return cudf::detail::tdigest::make_tdigest_column(num_rows, + std::move(means), + std::move(weights), + std::move(offsets), + std::move(min_col), + std::move(max_col), + stream, + mr); + } + + // otherwise we need to strip out the stubs. + auto remove_stubs = [&](column_view const& col, size_type num_stubs) { + auto result = cudf::make_numeric_column( + data_type{type_id::FLOAT64}, col.size() - num_stubs, mask_state::UNALLOCATED, stream, mr); + thrust::remove_copy_if(rmm::exec_policy(stream), + col.begin(), + col.end(), + thrust::make_counting_iterator(0), + result->mutable_view().begin(), + is_stub_weight); + return result; + }; + // remove from the means and weights column + auto _means = remove_stubs(*means, num_stubs); + auto _weights = remove_stubs(*weights, num_stubs); + + // adjust offsets. + rmm::device_uvector sizes(num_rows, stream); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_rows, + sizes.begin(), + [offsets = offsets->view().begin()] __device__(size_type i) { + return offsets[i + 1] - offsets[i]; + }); + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [sizes = sizes.begin(), is_stub_digest, num_rows] __device__(size_type i) { + return i == num_rows || is_stub_digest(i) ? 
0 : sizes[i]; + }); + thrust::exclusive_scan(rmm::exec_policy(stream), + iter, + iter + num_rows + 1, + offsets->mutable_view().begin(), + 0); + + // assemble final column + return cudf::detail::tdigest::make_tdigest_column(num_rows, + std::move(_means), + std::move(_weights), + std::move(offsets), + std::move(min_col), + std::move(max_col), + stream, + mr); +} + /** * @brief Compute a column of tdigests. * @@ -413,6 +535,7 @@ generate_group_cluster_info(int delta, * @param group_cluster_wl Cluster weight limits for each group. * @param group_cluster_offsets R-value reference of offsets into the cluster weight limits. * @param total_clusters Total number of clusters in all groups. + * @param has_nulls Whether or not the input contains nulls * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory * @@ -428,10 +551,11 @@ std::unique_ptr compute_tdigests(int delta, rmm::device_uvector const& group_cluster_wl, std::unique_ptr&& group_cluster_offsets, size_type total_clusters, + bool has_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // the output for each group is column of data that represents the tdigest. since we want 1 row + // the output for each group is a column of data that represents the tdigest. since we want 1 row // per group, each row will be a list the length of the tdigest for that group. so our output // column is of the form: // struct { @@ -446,18 +570,7 @@ std::unique_ptr compute_tdigests(int delta, // double // max // } // - // if (total_clusters == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); } - std::vector> inner_children; - // mean - inner_children.push_back(cudf::make_fixed_width_column( - data_type{type_id::FLOAT64}, total_clusters, mask_state::UNALLOCATED, stream, mr)); - // weight - inner_children.push_back(cudf::make_fixed_width_column( - data_type{type_id::FLOAT64}, total_clusters, mask_state::UNALLOCATED, stream, mr)); - // tdigest struct - auto tdigests = - cudf::make_structs_column(total_clusters, std::move(inner_children), 0, {}, stream, mr); // each input group represents an individual tdigest. within each tdigest, we want the keys // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall @@ -469,14 +582,17 @@ std::unique_ptr compute_tdigests(int delta, group_cluster_wl = group_cluster_wl.data(), group_cluster_offsets = group_cluster_offsets->view().begin(), group_cumulative_weight] __device__(size_type value_index) -> size_type { + // get group index, relative value index within the group and cumulative weight. 
auto [group_index, relative_value_index, cumulative_weight] = group_cumulative_weight(value_index); (void)relative_value_index; - // compute start of cluster weight limits for this group - double const* weight_limits = group_cluster_wl + group_cluster_offsets[group_index]; auto const num_clusters = group_cluster_offsets[group_index + 1] - group_cluster_offsets[group_index]; + if (num_clusters == 0) { return group_cluster_offsets[group_index]; } + + // compute start of cluster weight limits for this group + double const* weight_limits = group_cluster_wl + group_cluster_offsets[group_index]; // local cluster index size_type const group_cluster_index = @@ -490,11 +606,16 @@ std::unique_ptr compute_tdigests(int delta, return group_cluster_index + group_cluster_offsets[group_index]; }); + // mean and weight data + auto centroid_means = cudf::make_numeric_column( + data_type{type_id::FLOAT64}, total_clusters, mask_state::UNALLOCATED, stream, mr); + auto centroid_weights = cudf::make_numeric_column( + data_type{type_id::FLOAT64}, total_clusters, mask_state::UNALLOCATED, stream, mr); // reduce the centroids down by key. - cudf::mutable_column_view mean_col = - tdigests->child(cudf::detail::tdigest::mean_column_index).mutable_view(); - cudf::mutable_column_view weight_col = - tdigests->child(cudf::detail::tdigest::weight_column_index).mutable_view(); + cudf::mutable_column_view mean_col(*centroid_means); + cudf::mutable_column_view weight_col(*centroid_weights); + + // reduce the centroids into the clusters auto output = thrust::make_zip_iterator(thrust::make_tuple( mean_col.begin(), weight_col.begin(), thrust::make_discard_iterator())); auto const num_values = std::distance(centroids_begin, centroids_end); @@ -507,17 +628,16 @@ std::unique_ptr compute_tdigests(int delta, thrust::equal_to{}, // key equality check merge_centroids{}); - // create the list - auto const num_groups = group_cluster_offsets->size() - 1; - auto list = cudf::make_lists_column( - num_groups, std::move(group_cluster_offsets), std::move(tdigests), 0, {}); - // create final tdigest column - std::vector> children; - children.push_back(std::move(list)); - children.push_back(std::move(min_col)); - children.push_back(std::move(max_col)); - return make_structs_column(num_groups, std::move(children), 0, {}, stream, mr); + return build_output_column(group_cluster_offsets->size() - 1, + std::move(centroid_means), + std::move(centroid_weights), + std::move(group_cluster_offsets), + std::move(min_col), + std::move(max_col), + has_nulls, + stream, + mr); } // retrieve total weight of scalar inputs by group index @@ -566,6 +686,7 @@ struct typed_group_tdigest { nearest_value_scalar_weights{}, total_weight, cumulative_scalar_weight{group_offsets, group_labels}, + col.null_count() > 0, stream, mr); @@ -574,9 +695,9 @@ struct typed_group_tdigest { auto d_col = cudf::column_device_view::create(col); // compute min and max columns - auto min_col = cudf::make_fixed_width_column( + auto min_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); - auto max_col = cudf::make_fixed_width_column( + auto max_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); thrust::transform( rmm::exec_policy(stream), @@ -600,6 +721,7 @@ struct typed_group_tdigest { group_cluster_wl, std::move(group_cluster_offsets), total_clusters, + col.null_count() > 0, stream, mr); } @@ -648,22 +770,12 @@ std::unique_ptr group_merge_tdigest(column_view const& 
input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - cudf::detail::tdigest::check_is_valid_tdigest_column(input); + tdigest_column_view tdv(input); if (num_groups == 0 || input.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); } - structs_column_view scv(input); - lists_column_view lcv(scv.child(cudf::detail::tdigest::centroid_column_index)); - // ideally, we would just call .parent().child() here because tdigests cannot be - // sliced. however, lists_column_view() hides that particular interface. However, - // for the same reason, get_sliced_child() should be just as cheap. - auto data = lcv.get_sliced_child(stream); - structs_column_view tdigest(data); - auto mean = tdigest.child(cudf::detail::tdigest::mean_column_index); - auto weight = tdigest.child(cudf::detail::tdigest::weight_column_index); - // first step is to merge all the tdigests in each group. at the moment the only way to // make this work is to retrieve the group sizes (via group_offsets) and the individual digest // sizes (via input.offsets()) to the gpu and do the merges. The scale problem is that while the @@ -685,7 +797,7 @@ std::unique_ptr group_merge_tdigest(column_view const& input, stream); // bring tdigest offsets back to the host - auto tdigest_offsets = lcv.offsets(); + auto tdigest_offsets = tdv.centroids().offsets(); std::vector h_inner_offsets(tdigest_offsets.size()); cudaMemcpyAsync(h_inner_offsets.data(), tdigest_offsets.begin(), @@ -696,7 +808,7 @@ std::unique_ptr group_merge_tdigest(column_view const& input, stream.synchronize(); // extract all means and weights into a table - cudf::table_view tdigests_unsliced({mean, weight}); + cudf::table_view tdigests_unsliced({tdv.means(), tdv.weights()}); // generate the merged (but not yet compressed) tdigests for each group. std::vector> tdigests; @@ -727,30 +839,59 @@ std::unique_ptr group_merge_tdigest(column_view const& input, }); // generate min and max values - auto min_col = scv.child(cudf::detail::tdigest::min_column_index); - auto merged_min_col = cudf::make_fixed_width_column( + auto merged_min_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + auto min_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple(tdv.min_begin(), tdv.size_begin())), + tdigest_min{}); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), - min_col.begin(), + min_iter, thrust::make_discard_iterator(), merged_min_col->mutable_view().begin(), thrust::equal_to{}, // key equality check thrust::minimum{}); - auto max_col = scv.child(cudf::detail::tdigest::max_column_index); - auto merged_max_col = cudf::make_fixed_width_column( + auto merged_max_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); + auto max_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple(tdv.max_begin(), tdv.size_begin())), + tdigest_max{}); thrust::reduce_by_key(rmm::exec_policy(stream), group_labels.begin(), group_labels.end(), - max_col.begin(), + max_iter, thrust::make_discard_iterator(), merged_max_col->mutable_view().begin(), thrust::equal_to{}, // key equality check thrust::maximum{}); + // for any empty groups, set the min and max to be 0. not technically necessary but it makes + // testing simpler. 
+ auto group_num_weights = cudf::detail::make_counting_transform_iterator( + 0, + [outer_offsets = group_offsets.data(), + inner_offsets = + tdigest_offsets.begin()] __device__(size_type group_index) -> size_type { + auto const tdigest_begin = outer_offsets[group_index]; + auto const tdigest_end = outer_offsets[group_index + 1]; + return inner_offsets[tdigest_end] - inner_offsets[tdigest_begin]; + }); + auto group_is_empty = [] __device__(size_type group_size) { return group_size == 0; }; + thrust::replace_if(rmm::exec_policy(stream), + merged_min_col->mutable_view().begin(), + merged_min_col->mutable_view().end(), + group_num_weights, + group_is_empty, + 0); + thrust::replace_if(rmm::exec_policy(stream), + merged_max_col->mutable_view().begin(), + merged_max_col->mutable_view().end(), + group_num_weights, + group_is_empty, + 0); + // concatenate all the merged tdigests back into one table. std::vector tdigest_views; tdigest_views.reserve(num_groups); @@ -761,8 +902,8 @@ std::unique_ptr group_merge_tdigest(column_view const& input, auto merged = cudf::detail::concatenate(tdigest_views, stream, mr); // generate cumulative weights - auto merged_weights = merged->get_column(cudf::detail::tdigest::weight_column_index).view(); - auto cumulative_weights = cudf::make_fixed_width_column( + auto merged_weights = merged->get_column(1).view(); + auto cumulative_weights = cudf::make_numeric_column( data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED); auto keys = cudf::detail::make_counting_transform_iterator( 0, @@ -791,9 +932,12 @@ std::unique_ptr group_merge_tdigest(column_view const& input, [outer_offsets = group_offsets.data(), inner_offsets = tdigest_offsets.begin(), cumulative_weights = - cumulative_weights->view().begin()] __device__(size_type group_index) { + cumulative_weights->view().begin()] __device__(size_type group_index) -> double { + // if there's no weights in this group of digests at all, return 0. + auto const num_weights = + inner_offsets[outer_offsets[group_index + 1]] - inner_offsets[outer_offsets[group_index]]; auto const last_weight_index = inner_offsets[outer_offsets[group_index + 1]] - 1; - return cumulative_weights[last_weight_index]; + return num_weights == 0 ? 0 : cumulative_weights[last_weight_index]; }); auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info( delta, @@ -807,15 +951,15 @@ std::unique_ptr group_merge_tdigest(column_view const& input, group_labels, group_offsets.data(), {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + false, stream, mr); // input centroid values auto centroids = cudf::detail::make_counting_transform_iterator( 0, - make_weighted_centroid{ - merged->get_column(cudf::detail::tdigest::mean_column_index).view().begin(), - merged_weights.begin()}); + make_weighted_centroid{merged->get_column(0).view().begin(), + merged_weights.begin()}); // compute the tdigest return compute_tdigests(delta, @@ -831,6 +975,7 @@ std::unique_ptr group_merge_tdigest(column_view const& input, group_cluster_wl, std::move(group_cluster_offsets), total_clusters, + false, stream, mr); } diff --git a/cpp/src/groupby/sort/group_util.cuh b/cpp/src/groupby/sort/group_util.cuh new file mode 100644 index 00000000000..31ff29ed4c3 --- /dev/null +++ b/cpp/src/groupby/sort/group_util.cuh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+namespace cudf {
+namespace groupby {
+namespace detail {
+
+/**
+ * @brief Binary operator ArgMin/ArgMax with index values into the input table.
+ *
+ * @tparam T Type of the underlying data. This is the fallback used for cases in which T does not
+ * support the '<' operator.
+ */
+template
+struct row_arg_minmax_fn {
+  size_type const num_rows;
+  row_lexicographic_comparator const comp;
+  bool const arg_min;
+
+  row_arg_minmax_fn(size_type const num_rows_,
+                    table_device_view const& table_,
+                    null_order const* null_precedence_,
+                    bool const arg_min_)
+    : num_rows(num_rows_), comp(table_, table_, nullptr, null_precedence_), arg_min(arg_min_)
+  {
+  }
+
+  // This function is explicitly prevented from inlining because it calls
+  // `row_lexicographic_comparator::operator()`, which is itself inlined and very heavyweight.
+  // Inlining it would generate a large amount of code, and using objects of this functor with
+  // `thrust::reduce_by_key` or `thrust::scan_by_key` would significantly increase compile time.
+  __attribute__((noinline)) __device__ auto operator()(size_type lhs_idx, size_type rhs_idx) const
+  {
+    // The extra bounds checking is due to issue github.com/rapidsai/cudf/9156 and
+    // github.com/NVIDIA/thrust/issues/1525
+    // where invalid random values may be passed here by thrust::reduce_by_key
+    if (lhs_idx < 0 || lhs_idx >= num_rows) { return rhs_idx; }
+    if (rhs_idx < 0 || rhs_idx >= num_rows) { return lhs_idx; }
+
+    // Return `lhs_idx` iff:
+    //   row(lhs_idx) < row(rhs_idx) and finding ArgMin, or
+    //   row(lhs_idx) >= row(rhs_idx) and finding ArgMax.
+    return comp(lhs_idx, rhs_idx) == arg_min ?
lhs_idx : rhs_idx; + } +}; + +} // namespace detail +} // namespace groupby +} // namespace cudf diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index ace5d0e539c..b22f82ce7e4 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -29,8 +30,6 @@ #include -#include - #include namespace cudf { diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index c4905b86ab9..1caf2ff0371 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -23,11 +23,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include @@ -102,13 +102,11 @@ sort_groupby_helper::sort_groupby_helper(table_view const& keys, { using namespace cudf::structs::detail; - auto [flattened_keys, _, __, struct_null_vectors] = - flatten_nested_columns(keys, {}, {}, column_nullability::FORCE); + _flattened = flatten_nested_columns(keys, {}, {}, column_nullability::FORCE); + _keys = _flattened; auto is_supported_key_type = [](auto col) { return cudf::is_equality_comparable(col.type()); }; - CUDF_EXPECTS(std::all_of(flattened_keys.begin(), flattened_keys.end(), is_supported_key_type), + CUDF_EXPECTS(std::all_of(_keys.begin(), _keys.end(), is_supported_key_type), "Unsupported groupby key type does not support equality comparison"); - _struct_null_vectors = std::move(struct_null_vectors); - _keys = flattened_keys; // Cannot depend on caller's sorting if the column contains nulls, // and null values are to be excluded. diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index c4a9da9285d..a3f954920c8 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -538,8 +538,11 @@ class concurrent_unordered_map { } } - init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( - m_hashtbl_values, m_capacity, m_unused_key, m_unused_element); + if (m_capacity > 0) { + init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( + m_hashtbl_values, m_capacity, m_unused_key, m_unused_element); + } + CUDA_TRY(cudaGetLastError()); } }; diff --git a/cpp/src/hash/hash_constants.hpp b/cpp/src/hash/hash_constants.hpp deleted file mode 100644 index 0a5a9e0be93..00000000000 --- a/cpp/src/hash/hash_constants.hpp +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -namespace cudf { -namespace detail { - -struct md5_intermediate_data { - uint64_t message_length = 0; - uint32_t buffer_length = 0; - uint32_t hash_value[4] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476}; - uint8_t buffer[64]; -}; - -// Type for the shift constants table. 
-using md5_shift_constants_type = uint32_t; - -__device__ __constant__ md5_shift_constants_type md5_shift_constants[16] = { - 7, - 12, - 17, - 22, - 5, - 9, - 14, - 20, - 4, - 11, - 16, - 23, - 6, - 10, - 15, - 21, -}; - -// Type for the hash constants table. -using md5_hash_constants_type = uint32_t; - -__device__ __constant__ md5_hash_constants_type md5_hash_constants[64] = { - 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, - 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, - 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, - 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, - 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, - 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, - 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, - 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391, -}; -} // namespace detail -} // namespace cudf diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index 973f3204c37..b9915da90b9 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -15,10 +15,14 @@ */ #include #include +#include #include +#include #include #include +#include #include +#include #include #include @@ -26,71 +30,318 @@ #include #include +#include + namespace cudf { + +namespace detail { + namespace { +// The MD5 algorithm and its hash/shift constants are officially specified in +// RFC 1321. For convenience, these values can also be found on Wikipedia: +// https://en.wikipedia.org/wiki/MD5 +const __constant__ uint32_t md5_shift_constants[16] = { + 7, 12, 17, 22, 5, 9, 14, 20, 4, 11, 16, 23, 6, 10, 15, 21}; + +const __constant__ uint32_t md5_hash_constants[64] = { + 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501, + 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, + 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a, + 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, + 0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1, + 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391, +}; + +template +struct hash_circular_buffer { + uint8_t storage[capacity]; + uint8_t* cur; + int available_space{capacity}; + hash_step_callable hash_step; + + CUDA_DEVICE_CALLABLE hash_circular_buffer(hash_step_callable hash_step) + : cur{storage}, hash_step{hash_step} + { + } + + CUDA_DEVICE_CALLABLE void put(uint8_t const* in, int size) + { + int copy_start = 0; + while (size >= available_space) { + // The buffer will be filled by this chunk of data. Copy a chunk of the + // data to fill the buffer and trigger a hash step. + memcpy(cur, in + copy_start, available_space); + hash_step(storage); + size -= available_space; + copy_start += available_space; + cur = storage; + available_space = capacity; + } + // The buffer will not be filled by the remaining data. 
That is, `size >= 0 + // && size < capacity`. We copy the remaining data into the buffer but do + // not trigger a hash step. + memcpy(cur, in + copy_start, size); + cur += size; + available_space -= size; + } + + CUDA_DEVICE_CALLABLE void pad(int const space_to_leave) + { + if (space_to_leave > available_space) { + memset(cur, 0x00, available_space); + hash_step(storage); + cur = storage; + available_space = capacity; + } + memset(cur, 0x00, available_space - space_to_leave); + cur += available_space - space_to_leave; + available_space = space_to_leave; + } + + CUDA_DEVICE_CALLABLE const uint8_t& operator[](int idx) const { return storage[idx]; } +}; + +// Get a uint8_t pointer to a column element and its size as a pair. +template +auto CUDA_DEVICE_CALLABLE get_element_pointer_and_size(Element const& element) +{ + if constexpr (is_fixed_width() && !is_chrono()) { + return thrust::make_pair(reinterpret_cast(&element), sizeof(Element)); + } else { + cudf_assert(false && "Unsupported type."); + } +} + +template <> +auto CUDA_DEVICE_CALLABLE get_element_pointer_and_size(string_view const& element) +{ + return thrust::make_pair(reinterpret_cast(element.data()), element.size_bytes()); +} + +struct MD5Hasher { + static constexpr int message_chunk_size = 64; + + CUDA_DEVICE_CALLABLE MD5Hasher(char* result_location) + : result_location(result_location), buffer(md5_hash_step{hash_values}) + { + } + + CUDA_DEVICE_CALLABLE ~MD5Hasher() + { + // On destruction, finalize the message buffer and write out the current + // hexadecimal hash value to the result location. + // Add a one byte flag 0b10000000 to signal the end of the message. + uint8_t constexpr end_of_message = 0x80; + // The message length is appended to the end of the last chunk processed. + uint64_t const message_length_in_bits = message_length * 8; + + buffer.put(&end_of_message, sizeof(end_of_message)); + buffer.pad(sizeof(message_length_in_bits)); + buffer.put(reinterpret_cast(&message_length_in_bits), + sizeof(message_length_in_bits)); + + for (int i = 0; i < 4; ++i) { + uint32ToLowercaseHexString(hash_values[i], result_location + (8 * i)); + } + } + + MD5Hasher(const MD5Hasher&) = delete; + MD5Hasher& operator=(const MD5Hasher&) = delete; + MD5Hasher(MD5Hasher&&) = delete; + MD5Hasher& operator=(MD5Hasher&&) = delete; + + template + void CUDA_DEVICE_CALLABLE process(Element const& element) + { + auto const normalized_element = normalize_nans_and_zeros(element); + auto const [element_ptr, size] = get_element_pointer_and_size(normalized_element); + buffer.put(element_ptr, size); + message_length += size; + } + + /** + * @brief Core MD5 algorithm implementation. Processes a single 64-byte chunk, + * updating the hash value so far. Does not zero out the buffer contents. + */ + struct md5_hash_step { + uint32_t (&hash_values)[4]; + + void CUDA_DEVICE_CALLABLE operator()(const uint8_t (&buffer)[message_chunk_size]) + { + uint32_t A = hash_values[0]; + uint32_t B = hash_values[1]; + uint32_t C = hash_values[2]; + uint32_t D = hash_values[3]; + + for (int j = 0; j < message_chunk_size; j++) { + uint32_t F; + uint32_t g; + // No default case is needed because j < 64. j / 16 is always 0, 1, 2, or 3. 
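Stepping back to the finalization sequence in the MD5Hasher destructor above (put the 0x80 end-of-message byte, pad, then put the 64-bit bit length): a minimal host-side sketch, not part of the patch, of the chunk layout that sequence produces. The function name md5_padded_size is illustrative; the only assumption is the RFC 1321 rule that the flag byte, zero padding, and 8-byte length round the stream up to whole 64-byte chunks.

#include <cstdint>
#include <cstdio>

// Total number of bytes hashed for a message of `message_length` bytes:
// message + 0x80 flag byte + zero padding + 8-byte bit length, rounded up
// to a multiple of the 64-byte chunk size.
std::uint64_t md5_padded_size(std::uint64_t message_length)
{
  constexpr std::uint64_t chunk_size = 64;
  std::uint64_t const minimum = message_length + 1 + 8;
  return ((minimum + chunk_size - 1) / chunk_size) * chunk_size;
}

int main()
{
  std::printf("%llu\n", static_cast<unsigned long long>(md5_padded_size(3)));   // 64: "abc" fits in one chunk
  std::printf("%llu\n", static_cast<unsigned long long>(md5_padded_size(56)));  // 128: no room left for the length
  return 0;
}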
+ switch (j / 16) { + case 0: + F = (B & C) | ((~B) & D); + g = j; + break; + case 1: + F = (D & B) | ((~D) & C); + g = (5 * j + 1) % 16; + break; + case 2: + F = B ^ C ^ D; + g = (3 * j + 5) % 16; + break; + case 3: + F = C ^ (B | (~D)); + g = (7 * j) % 16; + break; + } + + uint32_t buffer_element_as_int; + memcpy(&buffer_element_as_int, &buffer[g * 4], 4); + F = F + A + md5_hash_constants[j] + buffer_element_as_int; + A = D; + D = C; + C = B; + B = B + __funnelshift_l(F, F, md5_shift_constants[((j / 16) * 4) + (j % 4)]); + } + + hash_values[0] += A; + hash_values[1] += B; + hash_values[2] += C; + hash_values[3] += D; + } + }; + + char* result_location; + hash_circular_buffer buffer; + uint64_t message_length = 0; + uint32_t hash_values[4] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476}; +}; + +template +struct HasherDispatcher { + Hasher* hasher; + column_device_view const& input_col; + + CUDA_DEVICE_CALLABLE HasherDispatcher(Hasher* hasher, column_device_view const& input_col) + : hasher{hasher}, input_col{input_col} + { + } + + template + void CUDA_DEVICE_CALLABLE operator()(size_type const row_index) const + { + if constexpr ((is_fixed_width() && !is_chrono()) || + std::is_same_v) { + hasher->process(input_col.element(row_index)); + } else { + (void)row_index; + cudf_assert(false && "Unsupported type for hash function."); + } + } +}; + +template +struct ListHasherDispatcher { + Hasher* hasher; + column_device_view const& input_col; + + CUDA_DEVICE_CALLABLE ListHasherDispatcher(Hasher* hasher, column_device_view const& input_col) + : hasher{hasher}, input_col{input_col} + { + } + + template + void CUDA_DEVICE_CALLABLE operator()(size_type const offset_begin, + size_type const offset_end) const + { + if constexpr ((is_fixed_width() && !is_chrono()) || + std::is_same_v) { + for (size_type i = offset_begin; i < offset_end; i++) { + if (input_col.is_valid(i)) { hasher->process(input_col.element(i)); } + } + } else { + (void)offset_begin; + (void)offset_end; + cudf_assert(false && "Unsupported type for hash function."); + } + } +}; + // MD5 supported leaf data type check -bool md5_type_check(data_type dt) +constexpr inline bool md5_leaf_type_check(data_type dt) { - return !is_chrono(dt) && (is_fixed_width(dt) || (dt.id() == type_id::STRING)); + return (is_fixed_width(dt) && !is_chrono(dt)) || (dt.id() == type_id::STRING); } } // namespace -namespace detail { - std::unique_ptr md5_hash(table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (input.num_columns() == 0 || input.num_rows() == 0) { - const string_scalar string_128bit("d41d8cd98f00b204e9orig98ecf8427e"); - auto output = make_column_from_scalar(string_128bit, input.num_rows(), stream, mr); - return output; + // Return the MD5 hash of a zero-length input. 
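Before the body of md5_hash below, a reduced illustration (not from this patch) of the runtime-to-compile-time dispatch that HasherDispatcher relies on: a type tag examined at runtime selects which template instantiation of a functor runs, which is how each column's data_type ends up in the matching process<Element>() call. The enum values and function names in this sketch are made up for illustration.

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>

enum class type_id { INT32, FLOAT64, STRING };

// Map a runtime type tag to a compile-time template instantiation of `f`.
template <typename Functor, typename... Args>
decltype(auto) dispatch(type_id id, Functor f, Args&&... args)
{
  switch (id) {
    case type_id::INT32: return f.template operator()<std::int32_t>(std::forward<Args>(args)...);
    case type_id::FLOAT64: return f.template operator()<double>(std::forward<Args>(args)...);
    case type_id::STRING: return f.template operator()<std::string>(std::forward<Args>(args)...);
  }
  throw std::invalid_argument("unknown type id");
}

// Stand-in for a hasher dispatcher: does something type-specific per element.
struct print_element_size {
  template <typename Element>
  void operator()(int row_index) const
  {
    std::cout << "row " << row_index << ": " << sizeof(Element) << "-byte element\n";
  }
};

int main()
{
  dispatch(type_id::FLOAT64, print_element_size{}, 0);  // runs the double instantiation
  return 0;
}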
+ string_scalar const string_128bit("d41d8cd98f00b204e9orig98ecf8427e"); + return make_column_from_scalar(string_128bit, input.num_rows(), stream, mr); } // Accepts string and fixed width columns, or single layer list columns holding those types - CUDF_EXPECTS( - std::all_of(input.begin(), - input.end(), - [](auto col) { - return md5_type_check(col.type()) || - (col.type().id() == type_id::LIST && md5_type_check(col.child(1).type())); - }), - "MD5 unsupported column type"); + CUDF_EXPECTS(std::all_of(input.begin(), + input.end(), + [](auto const& col) { + if (col.type().id() == type_id::LIST) { + return md5_leaf_type_check(lists_column_view(col).child().type()); + } + return md5_leaf_type_check(col.type()); + }), + "Unsupported column type for hash function."); + // Digest size in bytes + auto constexpr digest_size = 32; // Result column allocation and creation - auto begin = thrust::make_constant_iterator(32); + auto begin = thrust::make_constant_iterator(digest_size); auto offsets_column = cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); - auto chars_column = strings::detail::create_chars_child_column(input.num_rows() * 32, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); + auto chars_column = + strings::detail::create_chars_child_column(input.num_rows() * digest_size, stream, mr); + auto chars_view = chars_column->mutable_view(); + auto d_chars = chars_view.data(); rmm::device_buffer null_mask{0, stream, mr}; auto const device_input = table_device_view::create(input, stream); // Hash each row, hashing each element sequentially left to right - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.num_rows()), - [d_chars, device_input = *device_input] __device__(auto row_index) { - md5_intermediate_data hash_state; - MD5Hash hasher = MD5Hash{}; - for (int col_index = 0; col_index < device_input.num_columns(); col_index++) { - if (device_input.column(col_index).is_valid(row_index)) { - cudf::type_dispatcher( - device_input.column(col_index).type(), - hasher, - device_input.column(col_index), - row_index, - &hash_state); - } - } - hasher.finalize(&hash_state, d_chars + (row_index * 32)); - }); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.num_rows()), + [d_chars, device_input = *device_input] __device__(auto row_index) { + MD5Hasher hasher(d_chars + (row_index * digest_size)); + for (auto const& col : device_input) { + if (col.is_valid(row_index)) { + if (col.type().id() == type_id::LIST) { + auto const data_col = col.child(lists_column_view::child_column_index); + auto const offsets = col.child(lists_column_view::offsets_column_index); + if (data_col.type().id() == type_id::LIST) { + cudf_assert(false && "Nested list unsupported"); + } + auto const offset_begin = offsets.element(row_index); + auto const offset_end = offsets.element(row_index + 1); + cudf::type_dispatcher( + data_col.type(), ListHasherDispatcher(&hasher, data_col), offset_begin, offset_end); + } else { + cudf::type_dispatcher( + col.type(), HasherDispatcher(&hasher, col), row_index); + } + } + } + }); return make_strings_column( input.num_rows(), std::move(offsets_column), std::move(chars_column), 0, std::move(null_mask)); diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index db55c82f109..59095fef85e 100644 --- a/cpp/src/interop/from_arrow.cu +++ 
b/cpp/src/interop/from_arrow.cu @@ -268,7 +268,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (array.length() == 0) { return make_empty_column(data_type{type_id::STRING}); } + if (array.length() == 0) { return make_empty_column(type_id::STRING); } auto str_array = static_cast(&array); auto offset_array = std::make_unique( str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr); diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu index 219d5759353..a481da38d30 100644 --- a/cpp/src/io/csv/durations.cu +++ b/cpp/src/io/csv/durations.cu @@ -224,7 +224,7 @@ std::unique_ptr pandas_format_durations(column_view const& durations, rmm::mr::device_memory_resource* mr) { size_type strings_count = durations.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); return type_dispatcher(durations.type(), dispatch_from_durations_fn{}, durations, stream, mr); } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 99b593c99b9..7f032b6987c 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -19,14 +19,21 @@ * @brief cuDF-IO CSV reader class implementation */ -#include "reader_impl.hpp" +#include "csv_common.h" +#include "csv_gpu.h" #include +#include +#include #include #include #include #include +#include +#include +#include +#include #include #include #include @@ -37,10 +44,14 @@ #include #include +#include #include +#include #include #include #include +#include +#include using std::string; using std::vector; @@ -56,27 +67,40 @@ namespace csv { using namespace cudf::io::csv; using namespace cudf::io; +namespace { + /** - * @brief Translates a dtype string and returns its dtype enumeration and any - * extended dtype flags that are supported by cuIO. Often, this is a column - * with the same underlying dtype the basic types, but with different parsing - * interpretations. - * - * @param[in] dtype String containing the basic or extended dtype + * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span. * - * @return Tuple of data_type and flags + * Row offsets are stored this way to avoid reallocation/copies when discarding front or back + * elements. 
*/ -std::tuple get_dtype_info(const std::string& dtype) -{ - if (dtype == "hex" || dtype == "hex64") { - return std::make_tuple(data_type{cudf::type_id::INT64}, column_parse::as_hexadecimal); - } - if (dtype == "hex32") { - return std::make_tuple(data_type{cudf::type_id::INT32}, column_parse::as_hexadecimal); +class selected_rows_offsets { + rmm::device_uvector all; + device_span selected; + + public: + selected_rows_offsets(rmm::device_uvector&& data, + device_span selected_span) + : all{std::move(data)}, selected{selected_span} + { } + selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} - return std::make_tuple(convert_string_to_dtype(dtype), column_parse::as_default); -} + operator device_span() const { return selected; } + void shrink(size_t size) + { + CUDF_EXPECTS(size <= selected.size(), "New size must be smaller"); + selected = selected.subspan(0, size); + } + void erase_first_n(size_t n) + { + CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove"); + selected = selected.subspan(n, selected.size() - n); + } + auto size() const { return selected.size(); } + auto data() const { return selected.data(); } +}; /** * @brief Removes the first and Last quote in the string @@ -96,10 +120,10 @@ string removeQuotes(string str, char quotechar) * @brief Parse the first row to set the column names in the raw_csv parameter. * The first row can be either the header row, or the first data row */ -std::vector setColumnNames(std::vector const& header, - parse_options_view const& opts, - int header_row, - std::string prefix) +std::vector get_column_names(std::vector const& header, + parse_options_view const& parse_opts, + int header_row, + std::string prefix) { std::vector col_names; @@ -112,35 +136,36 @@ std::vector setColumnNames(std::vector const& header, bool quotation = false; for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) { // Flip the quotation flag if current character is a quotechar - if (first_row[pos] == opts.quotechar) { + if (first_row[pos] == parse_opts.quotechar) { quotation = !quotation; } // Check if end of a column/row - else if (pos == first_row.size() - 1 || (!quotation && first_row[pos] == opts.terminator) || - (!quotation && first_row[pos] == opts.delimiter)) { + else if (pos == first_row.size() - 1 || + (!quotation && first_row[pos] == parse_opts.terminator) || + (!quotation && first_row[pos] == parse_opts.delimiter)) { // This is the header, add the column name if (header_row >= 0) { // Include the current character, in case the line is not terminated int col_name_len = pos - prev + 1; // Exclude the delimiter/terminator is present - if (first_row[pos] == opts.delimiter || first_row[pos] == opts.terminator) { + if (first_row[pos] == parse_opts.delimiter || first_row[pos] == parse_opts.terminator) { --col_name_len; } // Also exclude '\r' character at the end of the column name if it's // part of the terminator - if (col_name_len > 0 && opts.terminator == '\n' && first_row[pos] == '\n' && + if (col_name_len > 0 && parse_opts.terminator == '\n' && first_row[pos] == '\n' && first_row[pos - 1] == '\r') { --col_name_len; } const string new_col_name(first_row.data() + prev, col_name_len); - col_names.push_back(removeQuotes(new_col_name, opts.quotechar)); + col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar)); // Stop parsing when we hit the line terminator; relevant when there is // a blank line following the header. 
In this case, first_row includes // multiple line terminators at the end, as the new recStart belongs to // a line that comes after the blank line(s) - if (!quotation && first_row[pos] == opts.terminator) { break; } + if (!quotation && first_row[pos] == parse_opts.terminator) { break; } } else { // This is the first data row, add the automatically generated name col_names.push_back(prefix + std::to_string(num_cols)); @@ -148,8 +173,8 @@ std::vector setColumnNames(std::vector const& header, num_cols++; // Skip adjacent delimiters if delim_whitespace is set - while (opts.multi_delimiter && pos < first_row.size() && first_row[pos] == opts.delimiter && - first_row[pos + 1] == opts.delimiter) { + while (parse_opts.multi_delimiter && pos < first_row.size() && + first_row[pos] == parse_opts.delimiter && first_row[pos + 1] == parse_opts.delimiter) { ++pos; } prev = pos + 1; @@ -170,277 +195,43 @@ void erase_except_last(C& container, rmm::cuda_stream_view stream) container.resize(1, stream); } -std::pair, reader::impl::selected_rows_offsets> -reader::impl::select_data_and_row_offsets(rmm::cuda_stream_view stream) -{ - auto range_offset = opts_.get_byte_range_offset(); - auto range_size = opts_.get_byte_range_size(); - auto range_size_padded = opts_.get_byte_range_size_with_padding(); - auto skip_rows = opts_.get_skiprows(); - auto skip_end_rows = opts_.get_skipfooter(); - auto num_rows = opts_.get_nrows(); - - if (range_offset > 0 || range_size > 0) { - CUDF_EXPECTS(opts_.get_compression() == compression_type::NONE, - "Reading compressed data using `byte range` is unsupported"); - } - - // Transfer source data to GPU - if (!source_->is_empty()) { - auto const data_size = (range_size_padded != 0) ? range_size_padded : source_->size(); - auto const buffer = source_->host_read(range_offset, data_size); - - auto h_data = host_span( // - reinterpret_cast(buffer->data()), - buffer->size()); - - std::vector h_uncomp_data_owner; - - if (opts_.get_compression() != compression_type::NONE) { - h_uncomp_data_owner = get_uncompressed_data(h_data, opts_.get_compression()); - h_data = h_uncomp_data_owner; - } - - // None of the parameters for row selection is used, we are parsing the entire file - const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && - skip_end_rows <= 0 && num_rows == -1; - - // With byte range, find the start of the first data row - size_t const data_start_offset = (range_offset != 0) ? find_first_row_start(h_data) : 0; - - // TODO: Allow parsing the header outside the mapped range - CUDF_EXPECTS((range_offset == 0 || opts_.get_header() < 0), - "byte_range offset with header not supported"); - - // Gather row offsets - auto data_row_offsets = - load_data_and_gather_row_offsets(h_data, - data_start_offset, - (range_size) ? range_size : h_data.size(), - (skip_rows > 0) ? 
skip_rows : 0, - num_rows, - load_whole_file, - stream); - auto& row_offsets = data_row_offsets.second; - // Exclude the rows that are to be skipped from the end - if (skip_end_rows > 0 && static_cast(skip_end_rows) < row_offsets.size()) { - row_offsets.shrink(row_offsets.size() - skip_end_rows); - } - return data_row_offsets; - } - return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; -} - -std::vector reader::impl::select_data_types( - std::map const& col_type_map) -{ - std::vector selected_dtypes; - - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { - auto const col_type_it = col_type_map.find(col_names_[col]); - CUDF_EXPECTS(col_type_it != col_type_map.end(), - "Must specify data types for all active columns"); - selected_dtypes.emplace_back(col_type_it->second); - } - } - return selected_dtypes; -} - -std::vector reader::impl::select_data_types(std::vector const& dtypes) -{ - std::vector selected_dtypes; - - if (dtypes.size() == 1) { - // If it's a single dtype, assign that dtype to all active columns - selected_dtypes.resize(num_active_cols_, dtypes.front()); - } else { - // If it's a list, assign dtypes to active columns in the given order - CUDF_EXPECTS(static_cast(dtypes.size()) >= num_actual_cols_, - "Must specify data types for all columns"); - - for (int col = 0; col < num_actual_cols_; col++) { - if (column_flags_[col] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[col]); } - } - } - return selected_dtypes; -} - -table_with_metadata reader::impl::read(rmm::cuda_stream_view stream) -{ - auto const data_row_offsets = select_data_and_row_offsets(stream); - auto const& data = data_row_offsets.first; - auto const& row_offsets = data_row_offsets.second; - - // Exclude the end-of-data row from number of rows with actual data - num_records_ = std::max(row_offsets.size(), 1ul) - 1; - - // Check if the user gave us a list of column names - if (not opts_.get_names().empty()) { - column_flags_.resize(opts_.get_names().size(), column_parse::enabled); - col_names_ = opts_.get_names(); - } else { - col_names_ = setColumnNames(header_, opts.view(), opts_.get_header(), opts_.get_prefix()); - - num_actual_cols_ = num_active_cols_ = col_names_.size(); - - column_flags_.resize(num_actual_cols_, column_parse::enabled); - - // Rename empty column names to "Unnamed: col_index" - for (size_t col_idx = 0; col_idx < col_names_.size(); ++col_idx) { - if (col_names_[col_idx].empty()) { - col_names_[col_idx] = string("Unnamed: ") + std::to_string(col_idx); - } - } - - // Looking for duplicates - std::unordered_map col_names_histogram; - for (auto& col_name : col_names_) { - // Operator [] inserts a default-initialized value if the given key is not - // present - if (++col_names_histogram[col_name] > 1) { - if (opts_.is_enabled_mangle_dupe_cols()) { - // Rename duplicates of column X as X.1, X.2, ...; First appearance - // stays as X - do { - col_name += "." 
+ std::to_string(col_names_histogram[col_name] - 1); - } while (col_names_histogram[col_name]++); - } else { - // All duplicate columns will be ignored; First appearance is parsed - const auto idx = &col_name - col_names_.data(); - column_flags_[idx] = column_parse::disabled; - } - } - } - - // Update the number of columns to be processed, if some might have been - // removed - if (!opts_.is_enabled_mangle_dupe_cols()) { num_active_cols_ = col_names_histogram.size(); } - } - - // User can specify which columns should be parsed - if (!opts_.get_use_cols_indexes().empty() || !opts_.get_use_cols_names().empty()) { - std::fill(column_flags_.begin(), column_flags_.end(), column_parse::disabled); - - for (const auto index : opts_.get_use_cols_indexes()) { - column_flags_[index] = column_parse::enabled; - } - num_active_cols_ = std::unordered_set(opts_.get_use_cols_indexes().begin(), - opts_.get_use_cols_indexes().end()) - .size(); - - for (const auto& name : opts_.get_use_cols_names()) { - const auto it = std::find(col_names_.begin(), col_names_.end(), name); - if (it != col_names_.end()) { - auto curr_it = it - col_names_.begin(); - if (column_flags_[curr_it] == column_parse::disabled) { - column_flags_[curr_it] = column_parse::enabled; - num_active_cols_++; - } - } - } - } - - // User can specify which columns should be read as datetime - if (!opts_.get_parse_dates_indexes().empty() || !opts_.get_parse_dates_names().empty()) { - for (const auto index : opts_.get_parse_dates_indexes()) { - column_flags_[index] |= column_parse::as_datetime; - } - - for (const auto& name : opts_.get_parse_dates_names()) { - auto it = std::find(col_names_.begin(), col_names_.end(), name); - if (it != col_names_.end()) { - column_flags_[it - col_names_.begin()] |= column_parse::as_datetime; - } - } - } - - // User can specify which columns should be parsed as hexadecimal - if (!opts_.get_parse_hex_indexes().empty() || !opts_.get_parse_hex_names().empty()) { - for (const auto index : opts_.get_parse_hex_indexes()) { - column_flags_[index] |= column_parse::as_hexadecimal; - } - - for (const auto& name : opts_.get_parse_hex_names()) { - auto it = std::find(col_names_.begin(), col_names_.end(), name); - if (it != col_names_.end()) { - column_flags_[it - col_names_.begin()] |= column_parse::as_hexadecimal; - } - } - } - - // Return empty table rather than exception if nothing to load - if (num_active_cols_ == 0) { return {std::make_unique
(), {}}; } - - auto metadata = table_metadata{}; - auto out_columns = std::vector>(); - - bool has_to_infer_column_types = - std::visit([](const auto& dtypes) { return dtypes.empty(); }, opts_.get_dtypes()); - - std::vector column_types; - if (has_to_infer_column_types) { - column_types = infer_column_types(data, row_offsets, stream); - } else { - column_types = std::visit([&](auto const& data_types) { return select_data_types(data_types); }, - opts_.get_dtypes()); - } - - out_columns.reserve(column_types.size()); - - if (num_records_ != 0) { - auto out_buffers = decode_data(data, row_offsets, column_types, stream); - for (size_t i = 0; i < column_types.size(); ++i) { - metadata.column_names.emplace_back(out_buffers[i].name); - if (column_types[i].id() == type_id::STRING && opts.quotechar != '\0' && - opts.doublequote == true) { - // PANDAS' default behavior of enabling doublequote for two consecutive - // quotechars in quoted fields results in reduction to a single quotechar - // TODO: Would be much more efficient to perform this operation in-place - // during the conversion stage - const std::string quotechar(1, opts.quotechar); - const std::string dblquotechar(2, opts.quotechar); - std::unique_ptr col = cudf::make_strings_column(*out_buffers[i]._strings, stream); - out_columns.emplace_back( - cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr_)); - } else { - out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr_)); - } - } - } else { - // Create empty columns - for (size_t i = 0; i < column_types.size(); ++i) { - out_columns.emplace_back(make_empty_column(column_types[i])); - } - // Handle empty metadata - for (int col = 0; col < num_actual_cols_; ++col) { - if (column_flags_[col] & column_parse::enabled) { - metadata.column_names.emplace_back(col_names_[col]); - } - } - } - return {std::make_unique
(std::move(out_columns)), std::move(metadata)}; -} - -size_t reader::impl::find_first_row_start(host_span data) +size_t find_first_row_start(char row_terminator, host_span data) { // For now, look for the first terminator (assume the first terminator isn't within a quote) // TODO: Attempt to infer this from the data size_t pos = 0; - while (pos < data.size() && data[pos] != opts.terminator) { + while (pos < data.size() && data[pos] != row_terminator) { ++pos; } return std::min(pos + 1, data.size()); } -std::pair, reader::impl::selected_rows_offsets> -reader::impl::load_data_and_gather_row_offsets(host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream) +/** + * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. + * + * This function scans the input data to record the row offsets (relative to the start of the + * input data). A row is actually the data/offset between two termination symbols. + * + * @param data Uncompressed input data in host memory + * @param range_begin Only include rows starting after this position + * @param range_end Only include rows starting before this position + * @param skip_rows Number of rows to skip from the start + * @param num_rows Number of rows to read; -1: all remaining data + * @param load_whole_file Hint that the entire data will be needed on gpu + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Input data and row offsets in the device memory + */ +std::pair, selected_rows_offsets> load_data_and_gather_row_offsets( + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + std::vector& header, + host_span data, + size_t range_begin, + size_t range_end, + size_t skip_rows, + int64_t num_rows, + bool load_whole_file, + rmm::cuda_stream_view stream) { constexpr size_t max_chunk_bytes = 64 * 1024 * 1024; // 64MB size_t buffer_size = std::min(max_chunk_bytes, data.size()); @@ -449,7 +240,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, hostdevice_vector row_ctx(max_blocks); size_t buffer_pos = std::min(range_begin - std::min(range_begin, sizeof(char)), data.size()); size_t pos = std::min(range_begin, data.size()); - size_t header_rows = (opts_.get_header() >= 0) ? opts_.get_header() + 1 : 0; + size_t header_rows = (reader_opts.get_header() >= 0) ? reader_opts.get_header() + 1 : 0; uint64_t ctx = 0; // For compatibility with the previous parser, a row is considered in-range if the @@ -475,7 +266,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, // Pass 1: Count the potential number of rows in each character block for each // possible parser state at the beginning of the block. 
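For context on the two passes referenced here, a host-side sketch (illustrative only; the real gather_row_offsets kernel additionally tracks quote and comment parser state per block, which this omits): pass 1 counts row terminators per block, an exclusive scan turns the counts into per-block write positions, and pass 2 writes the actual row offsets.

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <string>
#include <vector>

std::vector<std::size_t> gather_row_offsets_host(std::string const& data,
                                                 char terminator,
                                                 std::size_t block_size)
{
  if (data.empty()) { return {}; }
  std::size_t const num_blocks = (data.size() + block_size - 1) / block_size;

  // Pass 1: count row terminators in each block.
  std::vector<std::size_t> counts(num_blocks, 0);
  for (std::size_t b = 0; b < num_blocks; ++b) {
    auto const end = std::min(data.size(), (b + 1) * block_size);
    for (auto i = b * block_size; i < end; ++i) {
      if (data[i] == terminator) { ++counts[b]; }
    }
  }

  // Exclusive scan turns per-block counts into per-block output positions.
  std::vector<std::size_t> write_pos(num_blocks, 0);
  std::exclusive_scan(counts.begin(), counts.end(), write_pos.begin(), std::size_t{0});

  // Pass 2: each block writes its row-start offsets at its own position.
  std::vector<std::size_t> offsets(write_pos.back() + counts.back());
  for (std::size_t b = 0; b < num_blocks; ++b) {
    auto out = write_pos[b];
    auto const end = std::min(data.size(), (b + 1) * block_size);
    for (auto i = b * block_size; i < end; ++i) {
      if (data[i] == terminator) { offsets[out++] = i + 1; }
    }
  }
  return offsets;
}

The scan is what keeps the second pass independent per block: each block only needs its own count and the prefix of counts before it to know where to write.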
- uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(opts.view(), + uint32_t num_blocks = cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), row_ctx.device_ptr(), device_span(), d_data, @@ -514,7 +305,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, stream.value())); // Pass 2: Output row offsets - cudf::io::csv::gpu::gather_row_offsets(opts.view(), + cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), row_ctx.device_ptr(), all_row_offsets, d_data, @@ -551,8 +342,8 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, // num_rows does not include blank rows if (num_rows >= 0) { if (all_row_offsets.size() > header_rows + static_cast(num_rows)) { - size_t num_blanks = - cudf::io::csv::gpu::count_blank_rows(opts.view(), d_data, all_row_offsets, stream); + size_t num_blanks = cudf::io::csv::gpu::count_blank_rows( + parse_opts.view(), d_data, all_row_offsets, stream); if (all_row_offsets.size() - num_blanks > header_rows + static_cast(num_rows)) { // Got the desired number of rows break; @@ -571,7 +362,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, } while (pos < data.size()); auto const non_blank_row_offsets = - io::csv::gpu::remove_blank_rows(opts.view(), d_data, all_row_offsets, stream); + io::csv::gpu::remove_blank_rows(parse_opts.view(), d_data, all_row_offsets, stream); auto row_offsets = selected_rows_offsets{std::move(all_row_offsets), non_blank_row_offsets}; // Remove header rows and extract header @@ -588,7 +379,7 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, const auto header_end = buffer_pos + row_ctx[1]; CUDF_EXPECTS(header_start <= header_end && header_end <= data.size(), "Invalid csv header location"); - header_.assign(data.begin() + header_start, data.begin() + header_end); + header.assign(data.begin() + header_start, data.begin() + header_end); if (header_rows > 0) { row_offsets.erase_first_n(header_rows); } } // Apply num_rows limit @@ -598,30 +389,145 @@ reader::impl::load_data_and_gather_row_offsets(host_span data, return {std::move(d_data), std::move(row_offsets)}; } -std::vector reader::impl::infer_column_types(device_span data, - device_span row_offsets, - rmm::cuda_stream_view stream) +std::pair, selected_rows_offsets> select_data_and_row_offsets( + cudf::io::datasource* source, + csv_reader_options const& reader_opts, + std::vector& header, + parse_options const& parse_opts, + rmm::cuda_stream_view stream) +{ + auto range_offset = reader_opts.get_byte_range_offset(); + auto range_size = reader_opts.get_byte_range_size(); + auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); + auto skip_rows = reader_opts.get_skiprows(); + auto skip_end_rows = reader_opts.get_skipfooter(); + auto num_rows = reader_opts.get_nrows(); + + if (range_offset > 0 || range_size > 0) { + CUDF_EXPECTS(reader_opts.get_compression() == compression_type::NONE, + "Reading compressed data using `byte range` is unsupported"); + } + + // Transfer source data to GPU + if (!source->is_empty()) { + auto data_size = (range_size_padded != 0) ? 
range_size_padded : source->size(); + auto buffer = source->host_read(range_offset, data_size); + + auto h_data = host_span( // + reinterpret_cast(buffer->data()), + buffer->size()); + + std::vector h_uncomp_data_owner; + + if (reader_opts.get_compression() != compression_type::NONE) { + h_uncomp_data_owner = get_uncompressed_data(h_data, reader_opts.get_compression()); + h_data = h_uncomp_data_owner; + } + // None of the parameters for row selection is used, we are parsing the entire file + const bool load_whole_file = range_offset == 0 && range_size == 0 && skip_rows <= 0 && + skip_end_rows <= 0 && num_rows == -1; + + // With byte range, find the start of the first data row + size_t const data_start_offset = + (range_offset != 0) ? find_first_row_start(parse_opts.terminator, h_data) : 0; + + // TODO: Allow parsing the header outside the mapped range + CUDF_EXPECTS((range_offset == 0 || reader_opts.get_header() < 0), + "byte_range offset with header not supported"); + + // Gather row offsets + auto data_row_offsets = + load_data_and_gather_row_offsets(reader_opts, + parse_opts, + header, + h_data, + data_start_offset, + (range_size) ? range_size : h_data.size(), + (skip_rows > 0) ? skip_rows : 0, + num_rows, + load_whole_file, + stream); + auto& row_offsets = data_row_offsets.second; + // Exclude the rows that are to be skipped from the end + if (skip_end_rows > 0 && static_cast(skip_end_rows) < row_offsets.size()) { + row_offsets.shrink(row_offsets.size() - skip_end_rows); + } + return data_row_offsets; + } + return {rmm::device_uvector{0, stream}, selected_rows_offsets{stream}}; +} + +std::vector select_data_types(std::vector const& column_flags, + std::vector const& dtypes, + int32_t num_actual_columns, + int32_t num_active_columns) +{ + std::vector selected_dtypes; + + if (dtypes.size() == 1) { + // If it's a single dtype, assign that dtype to all active columns + selected_dtypes.resize(num_active_columns, dtypes.front()); + } else { + // If it's a list, assign dtypes to active columns in the given order + CUDF_EXPECTS(static_cast(dtypes.size()) >= num_actual_columns, + "Must specify data types for all columns"); + + for (int i = 0; i < num_actual_columns; i++) { + if (column_flags[i] & column_parse::enabled) { selected_dtypes.emplace_back(dtypes[i]); } + } + } + return selected_dtypes; +} + +std::vector get_data_types_from_column_names( + std::vector const& column_flags, + std::map const& column_type_map, + std::vector const& column_names, + int32_t num_actual_columns) +{ + std::vector selected_dtypes; + + for (int32_t i = 0; i < num_actual_columns; i++) { + if (column_flags[i] & column_parse::enabled) { + auto const col_type_it = column_type_map.find(column_names[i]); + CUDF_EXPECTS(col_type_it != column_type_map.end(), + "Must specify data types for all active columns"); + selected_dtypes.emplace_back(col_type_it->second); + } + } + + return selected_dtypes; +} + +std::vector infer_column_types(parse_options const& parse_opts, + std::vector const& column_flags, + device_span data, + device_span row_offsets, + int32_t num_records, + int32_t num_active_columns, + data_type timestamp_type, + rmm::cuda_stream_view stream) { std::vector dtypes; - if (num_records_ == 0) { - dtypes.resize(num_active_cols_, data_type{type_id::EMPTY}); + if (num_records == 0) { + dtypes.resize(num_active_columns, data_type{type_id::EMPTY}); } else { auto column_stats = - cudf::io::csv::gpu::detect_column_types(opts.view(), + cudf::io::csv::gpu::detect_column_types(parse_opts.view(), data, - 
make_device_uvector_async(column_flags_, stream), + make_device_uvector_async(column_flags, stream), row_offsets, - num_active_cols_, + num_active_columns, stream); stream.synchronize(); - for (int col = 0; col < num_active_cols_; col++) { + for (int col = 0; col < num_active_columns; col++) { unsigned long long int_count_total = column_stats[col].big_int_count + column_stats[col].negative_small_int_count + column_stats[col].positive_small_int_count; - if (column_stats[col].null_count == num_records_) { + if (column_stats[col].null_count == num_records) { // Entire column is NULL; allocate the smallest amount of memory dtypes.emplace_back(cudf::type_id::INT8); } else if (column_stats[col].string_count > 0L) { @@ -649,9 +555,9 @@ std::vector reader::impl::infer_column_types(device_span } } - if (opts_.get_timestamp_type().id() != cudf::type_id::EMPTY) { + if (timestamp_type.id() != cudf::type_id::EMPTY) { for (auto& type : dtypes) { - if (cudf::is_timestamp(type)) { type = opts_.get_timestamp_type(); } + if (cudf::is_timestamp(type)) { type = timestamp_type; } } } @@ -663,43 +569,50 @@ std::vector reader::impl::infer_column_types(device_span return dtypes; } -std::vector reader::impl::decode_data(device_span data, - device_span row_offsets, - host_span column_types, - rmm::cuda_stream_view stream) +std::vector decode_data(parse_options const& parse_opts, + std::vector const& column_flags, + std::vector const& column_names, + device_span data, + device_span row_offsets, + host_span column_types, + int32_t num_records, + int32_t num_actual_columns, + int32_t num_active_columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Alloc output; columns' data memory is still expected for empty dataframe std::vector out_buffers; out_buffers.reserve(column_types.size()); - for (int col = 0, active_col = 0; col < num_actual_cols_; ++col) { - if (column_flags_[col] & column_parse::enabled) { + for (int col = 0, active_col = 0; col < num_actual_columns; ++col) { + if (column_flags[col] & column_parse::enabled) { const bool is_final_allocation = column_types[active_col].id() != type_id::STRING; auto out_buffer = column_buffer(column_types[active_col], - num_records_, + num_records, true, stream, - is_final_allocation ? mr_ : rmm::mr::get_current_device_resource()); + is_final_allocation ? 
mr : rmm::mr::get_current_device_resource()); - out_buffer.name = col_names_[col]; + out_buffer.name = column_names[col]; out_buffer.null_count() = UNKNOWN_NULL_COUNT; out_buffers.emplace_back(std::move(out_buffer)); active_col++; } } - thrust::host_vector h_data(num_active_cols_); - thrust::host_vector h_valid(num_active_cols_); + thrust::host_vector h_data(num_active_columns); + thrust::host_vector h_valid(num_active_columns); - for (int i = 0; i < num_active_cols_; ++i) { + for (int i = 0; i < num_active_columns; ++i) { h_data[i] = out_buffers[i].data(); h_valid[i] = out_buffers[i].null_mask(); } - cudf::io::csv::gpu::decode_row_column_data(opts.view(), + cudf::io::csv::gpu::decode_row_column_data(parse_opts.view(), data, - make_device_uvector_async(column_flags_, stream), + make_device_uvector_async(column_flags, stream), row_offsets, make_device_uvector_async(column_types, stream), make_device_uvector_async(h_data, stream), @@ -709,6 +622,209 @@ std::vector reader::impl::decode_data(device_span dat return out_buffers; } +table_with_metadata read_csv(cudf::io::datasource* source, + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + std::vector header; + + auto const data_row_offsets = + select_data_and_row_offsets(source, reader_opts, header, parse_opts, stream); + + auto const& data = data_row_offsets.first; + auto const& row_offsets = data_row_offsets.second; + + // Exclude the end-of-data row from number of rows with actual data + auto num_records = std::max(row_offsets.size(), 1ul) - 1; + auto column_flags = std::vector(); + auto column_names = std::vector(); + auto num_actual_columns = static_cast(reader_opts.get_names().size()); + auto num_active_columns = num_actual_columns; + + // Check if the user gave us a list of column names + if (not reader_opts.get_names().empty()) { + column_flags.resize(reader_opts.get_names().size(), column_parse::enabled); + column_names = reader_opts.get_names(); + } else { + column_names = get_column_names( + header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); + + num_actual_columns = num_active_columns = column_names.size(); + + column_flags.resize(num_actual_columns, column_parse::enabled); + + // Rename empty column names to "Unnamed: col_index" + for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) { + if (column_names[col_idx].empty()) { + column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx); + } + } + + // Looking for duplicates + std::unordered_map col_names_histogram; + for (auto& col_name : column_names) { + // Operator [] inserts a default-initialized value if the given key is not + // present + if (++col_names_histogram[col_name] > 1) { + if (reader_opts.is_enabled_mangle_dupe_cols()) { + // Rename duplicates of column X as X.1, X.2, ...; First appearance + // stays as X + do { + col_name += "." 
+ std::to_string(col_names_histogram[col_name] - 1); + } while (col_names_histogram[col_name]++); + } else { + // All duplicate columns will be ignored; First appearance is parsed + const auto idx = &col_name - column_names.data(); + column_flags[idx] = column_parse::disabled; + } + } + } + + // Update the number of columns to be processed, if some might have been + // removed + if (!reader_opts.is_enabled_mangle_dupe_cols()) { + num_active_columns = col_names_histogram.size(); + } + } + + // User can specify which columns should be parsed + if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) { + std::fill(column_flags.begin(), column_flags.end(), column_parse::disabled); + + for (const auto index : reader_opts.get_use_cols_indexes()) { + column_flags[index] = column_parse::enabled; + } + num_active_columns = std::unordered_set(reader_opts.get_use_cols_indexes().begin(), + reader_opts.get_use_cols_indexes().end()) + .size(); + + for (const auto& name : reader_opts.get_use_cols_names()) { + const auto it = std::find(column_names.begin(), column_names.end(), name); + if (it != column_names.end()) { + auto curr_it = it - column_names.begin(); + if (column_flags[curr_it] == column_parse::disabled) { + column_flags[curr_it] = column_parse::enabled; + num_active_columns++; + } + } + } + } + + // User can specify which columns should be read as datetime + if (!reader_opts.get_parse_dates_indexes().empty() || + !reader_opts.get_parse_dates_names().empty()) { + for (const auto index : reader_opts.get_parse_dates_indexes()) { + column_flags[index] |= column_parse::as_datetime; + } + + for (const auto& name : reader_opts.get_parse_dates_names()) { + auto it = std::find(column_names.begin(), column_names.end(), name); + if (it != column_names.end()) { + column_flags[it - column_names.begin()] |= column_parse::as_datetime; + } + } + } + + // User can specify which columns should be parsed as hexadecimal + if (!reader_opts.get_parse_hex_indexes().empty() || !reader_opts.get_parse_hex_names().empty()) { + for (const auto index : reader_opts.get_parse_hex_indexes()) { + column_flags[index] |= column_parse::as_hexadecimal; + } + + for (const auto& name : reader_opts.get_parse_hex_names()) { + auto it = std::find(column_names.begin(), column_names.end(), name); + if (it != column_names.end()) { + column_flags[it - column_names.begin()] |= column_parse::as_hexadecimal; + } + } + } + + // Return empty table rather than exception if nothing to load + if (num_active_columns == 0) { return {std::make_unique
(), {}}; } + + auto metadata = table_metadata{}; + auto out_columns = std::vector>(); + + bool has_to_infer_column_types = + std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); + + std::vector column_types; + if (has_to_infer_column_types) { + column_types = infer_column_types( // + parse_opts, + column_flags, + data, + row_offsets, + num_records, + num_active_columns, + reader_opts.get_timestamp_type(), + stream); + } else { + column_types = + std::visit(cudf::detail::visitor_overload{ + [&](const std::vector& data_types) { + return select_data_types( + column_flags, data_types, num_actual_columns, num_active_columns); + }, + [&](const std::map& data_types) { + return get_data_types_from_column_names( // + column_flags, + data_types, + column_names, + num_actual_columns); + }}, + reader_opts.get_dtypes()); + } + + out_columns.reserve(column_types.size()); + + if (num_records != 0) { + auto out_buffers = decode_data( // + parse_opts, + column_flags, + column_names, + data, + row_offsets, + column_types, + num_records, + num_actual_columns, + num_active_columns, + stream, + mr); + for (size_t i = 0; i < column_types.size(); ++i) { + metadata.column_names.emplace_back(out_buffers[i].name); + if (column_types[i].id() == type_id::STRING && parse_opts.quotechar != '\0' && + parse_opts.doublequote == true) { + // PANDAS' default behavior of enabling doublequote for two consecutive + // quotechars in quoted fields results in reduction to a single quotechar + // TODO: Would be much more efficient to perform this operation in-place + // during the conversion stage + const std::string quotechar(1, parse_opts.quotechar); + const std::string dblquotechar(2, parse_opts.quotechar); + std::unique_ptr col = cudf::make_strings_column(*out_buffers[i]._strings, stream); + out_columns.emplace_back( + cudf::strings::replace(col->view(), dblquotechar, quotechar, -1, mr)); + } else { + out_columns.emplace_back(make_column(out_buffers[i], nullptr, stream, mr)); + } + } + } else { + // Create empty columns + for (size_t i = 0; i < column_types.size(); ++i) { + out_columns.emplace_back(make_empty_column(column_types[i])); + } + // Handle empty metadata + for (int col = 0; col < num_actual_columns; ++col) { + if (column_flags[col] & column_parse::enabled) { + metadata.column_names.emplace_back(column_names[col]); + } + } + } + return {std::make_unique
(std::move(out_columns)), std::move(metadata)}; +} + /** * @brief Create a serialized trie for N/A value matching, based on the options. */ @@ -807,33 +923,17 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, return parse_opts; } -reader::impl::impl(std::unique_ptr source, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : mr_(mr), source_(std::move(source)), opts_(options) -{ - num_actual_cols_ = opts_.get_names().size(); - num_active_cols_ = num_actual_cols_; - - opts = make_parse_options(options, stream); -} +} // namespace -// Forward to implementation -reader::reader(std::vector>&& sources, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(std::unique_ptr&& source, + csv_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); - _impl = std::make_unique(std::move(sources[0]), options, stream, mr); -} + auto parse_options = make_parse_options(options, stream); -// Destructor within this translation unit -reader::~reader() = default; - -// Forward to implementation -table_with_metadata reader::read(rmm::cuda_stream_view stream) { return _impl->read(stream); } + return read_csv(source.get(), options, parse_options, stream, mr); +} } // namespace csv } // namespace detail diff --git a/cpp/src/io/csv/reader_impl.hpp b/cpp/src/io/csv/reader_impl.hpp deleted file mode 100644 index de363a46ffe..00000000000 --- a/cpp/src/io/csv/reader_impl.hpp +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "csv_common.h" -#include "csv_gpu.h" - -#include -#include -#include - -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -using cudf::host_span; - -namespace cudf { -namespace io { -namespace detail { -namespace csv { -using namespace cudf::io::csv; -using namespace cudf::io; - -/** - * @brief Implementation for CSV reader - * - * The CSV reader is implemented in 4 stages: - * Stage 1: read and optionally decompress the input data in host memory - * (may be a memory-mapped view of the data on disk) - * - * Stage 2: gather the offset of each data row within the csv data. - * Since the number of rows in a given character block may depend on the - * initial parser state (like whether the block starts in a middle of a - * quote or not), a separate row count and output parser state is computed - * for every possible input parser state per 16KB character block. - * The result is then used to infer the parser state and starting row at - * the beginning of every character block. 
- * A second pass can then output the location of every row (which is needed - * for the subsequent parallel conversion of every row from csv text - * to cudf binary form) - * - * Stage 3: Optional stage to infer the data type of each CSV column. - * - * Stage 4: Convert every row from csv text form to cudf binary form. - */ -class reader::impl { - public: - /** - * @brief Constructor from a dataset source with reader options. - * - * @param source Dataset source - * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource to use for device memory allocation - */ - explicit impl(std::unique_ptr source, - csv_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Read an entire set or a subset of data and returns a set of columns. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return The set of columns along with metadata - */ - table_with_metadata read(rmm::cuda_stream_view stream); - - private: - /** - * @brief Offsets of CSV rows in device memory, accessed through a shrinkable span. - * - * Row offsets are stored this way to avoid reallocation/copies when discarding front or back - * elements. - */ - class selected_rows_offsets { - rmm::device_uvector all; - device_span selected; - - public: - selected_rows_offsets(rmm::device_uvector&& data, - device_span selected_span) - : all{std::move(data)}, selected{selected_span} - { - } - selected_rows_offsets(rmm::cuda_stream_view stream) : all{0, stream}, selected{all} {} - - operator device_span() const { return selected; } - void shrink(size_t size) - { - CUDF_EXPECTS(size <= selected.size(), "New size must be smaller"); - selected = selected.subspan(0, size); - } - void erase_first_n(size_t n) - { - CUDF_EXPECTS(n <= selected.size(), "Too many elements to remove"); - selected = selected.subspan(n, selected.size() - n); - } - auto size() const { return selected.size(); } - auto data() const { return selected.data(); } - }; - - /** - * @brief Selectively loads data on the GPU and gathers offsets of rows to read. - * - * Selection is based on read options. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - std::pair, reader::impl::selected_rows_offsets> - select_data_and_row_offsets(rmm::cuda_stream_view stream); - - /** - * @brief Finds row positions in the specified input data, and loads the selected data onto GPU. - * - * This function scans the input data to record the row offsets (relative to the start of the - * input data). A row is actually the data/offset between two termination symbols. 
- * - * @param data Uncompressed input data in host memory - * @param range_begin Only include rows starting after this position - * @param range_end Only include rows starting before this position - * @param skip_rows Number of rows to skip from the start - * @param num_rows Number of rows to read; -1: all remaining data - * @param load_whole_file Hint that the entire data will be needed on gpu - * @param stream CUDA stream used for device memory operations and kernel launches - * @return Input data and row offsets in the device memory - */ - std::pair, reader::impl::selected_rows_offsets> - load_data_and_gather_row_offsets(host_span data, - size_t range_begin, - size_t range_end, - size_t skip_rows, - int64_t num_rows, - bool load_whole_file, - rmm::cuda_stream_view stream); - - /** - * @brief Find the start position of the first data row - * - * @param h_data Uncompressed input data in host memory - * - * @return Byte position of the first row - */ - size_t find_first_row_start(host_span data); - - /** - * @brief Automatically infers each column's data type based on the CSV's data within that column. - * - * @param data The CSV data from which to infer the columns' data types - * @param row_offsets The row offsets into the CSV's data - * @param stream The stream to which the type inference-kernel will be dispatched - * @return The columns' inferred data types - */ - std::vector infer_column_types(device_span data, - device_span row_offsets, - rmm::cuda_stream_view stream); - - /** - * @brief Selects the columns' data types from the map of dtypes. - * - * @param col_type_map Column name -> data type map specifying the columns' target data types - * @return Sorted list of selected columns' data types - */ - std::vector select_data_types(std::map const& col_type_map); - - /** - * @brief Selects the columns' data types from the list of dtypes. - * - * @param dtypes Vector of data types specifying the columns' target data types - * @return Sorted list of selected columns' data types - */ - std::vector select_data_types(std::vector const& dtypes); - - /** - * @brief Converts the row-column data and outputs to column bufferrs. - * - * @param column_types Column types - * @param stream CUDA stream used for device memory operations and kernel launches. - * - * @return list of column buffers of decoded data, or ptr/size in the case of strings. 
- */ - std::vector decode_data(device_span data, - device_span row_offsets, - host_span column_types, - rmm::cuda_stream_view stream); - - private: - rmm::mr::device_memory_resource* mr_ = nullptr; - std::unique_ptr source_; - const csv_reader_options opts_; - - cudf::size_type num_records_ = 0; // Number of rows with actual data - int num_active_cols_ = 0; // Number of columns to read - int num_actual_cols_ = 0; // Number of columns in the dataset - - // Parsing options - parse_options opts{}; - std::vector column_flags_; - - // Intermediate data - std::vector col_names_; - std::vector header_; -}; - -} // namespace csv -} // namespace detail -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 9a0c701ea49..e8c673751db 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -22,7 +22,7 @@ #include "writer_impl.hpp" #include -#include +#include #include #include #include @@ -285,8 +285,16 @@ void writer::impl::write_chunked_begin(table_view const& table, const table_metadata* metadata, rmm::cuda_stream_view stream) { - if ((metadata != nullptr) && (options_.is_enabled_include_header())) { - auto const& column_names = metadata->column_names; + if (options_.is_enabled_include_header()) { + // need to generate column names if metadata is not provided + std::vector generated_col_names; + if (metadata == nullptr) { + generated_col_names.resize(table.num_columns()); + thrust::tabulate(generated_col_names.begin(), generated_col_names.end(), [](auto idx) { + return std::to_string(idx); + }); + } + auto const& column_names = (metadata == nullptr) ? generated_col_names : metadata->column_names; CUDF_EXPECTS(column_names.size() == static_cast(table.num_columns()), "Mismatch between number of column headers and table columns."); @@ -360,7 +368,7 @@ void writer::impl::write_chunked(strings_column_view const& str_column_view, strings_column_view strings_column{p_str_col_w_nl->view()}; auto total_num_bytes = strings_column.chars_size(); - char const* ptr_all_bytes = strings_column.chars().data(); + char const* ptr_all_bytes = strings_column.chars_begin(); if (out_sink_->is_device_write_preferred(total_num_bytes)) { // Direct write from device memory @@ -423,7 +431,7 @@ void writer::impl::write(table_view const& table, }); // split table_view into chunks: - vector_views = cudf::split(table, splits); + vector_views = cudf::detail::split(table, splits, stream); } // convert each chunk to CSV: diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 511a1a22ee7..b678941db21 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -183,8 +183,6 @@ compression_type infer_compression_type(compression_type compression, source_inf table_with_metadata read_json(json_reader_options options, rmm::mr::device_memory_resource* mr) { - namespace json = cudf::io::detail::json; - CUDF_FUNC_RANGE(); options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); @@ -193,16 +191,11 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - auto reader = - std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); - - return reader->read(options); + return detail::json::read_json(datasources, options, rmm::cuda_stream_default, mr); } table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) { - namespace csv = 
cudf::io::detail::csv; - CUDF_FUNC_RANGE(); options.set_compression(infer_compression_type(options.get_compression(), options.get_source())); @@ -211,10 +204,13 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - auto reader = - std::make_unique(std::move(datasources), options, rmm::cuda_stream_default, mr); + CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported."); - return reader->read(); + return cudf::io::detail::csv::read_csv( // + std::move(datasources[0]), + options, + rmm::cuda_stream_default, + mr); } // Freeform API wraps the detail writer class API @@ -255,7 +251,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info) // Get column names for (auto i = 0; i < metadata.get_num_columns(); i++) { - result.column_names.push_back(metadata.get_column_name(i)); + result.column_names.push_back(metadata.column_name(i)); } // Get file-level statistics, statistics of each column of file diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 64761ca4e92..b5ed43558d2 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -14,16 +14,17 @@ * limitations under the License. */ -#include "json_common.h" #include "json_gpu.h" #include +#include #include #include #include #include #include +#include #include #include #include @@ -334,19 +335,19 @@ __device__ field_descriptor next_field_descriptor(const char* begin, const char* end, parse_options_view const& opts, cudf::size_type field_idx, - col_map_type* col_map) + col_map_type col_map) { auto const desc_pre_trim = - col_map == nullptr + col_map.capacity() == 0 // No key - column and begin are trivial ? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)} : [&]() { auto const key_range = get_next_key(begin, end, opts.quotechar); auto const key_hash = MurmurHash3_32{}( cudf::string_view(key_range.first, key_range.second - key_range.first)); - auto const hash_col = col_map->find(key_hash); + auto const hash_col = col_map.find(key_hash); // Fall back to field index if not found (parsing error) - auto const column = (hash_col != col_map->end()) ? (*hash_col).second : field_idx; + auto const column = (hash_col != col_map.end()) ? 
(*hash_col).second : field_idx; // Skip the colon between the key and the value auto const value_begin = thrust::find(thrust::seq, key_range.second, end, ':') + 1; @@ -401,7 +402,7 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, device_span const data, device_span const row_offsets, device_span const column_types, - col_map_type* col_map, + col_map_type col_map, device_span const output_columns, device_span const valid_fields, device_span const num_valid_fields) @@ -421,6 +422,8 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, current = desc.value_end + 1; + using string_index_pair = thrust::pair; + // Empty fields are not legal values if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { // Type dispatcher does not handle strings @@ -472,14 +475,14 @@ __global__ void detect_data_types_kernel( parse_options_view const opts, device_span const data, device_span const row_offsets, - col_map_type* col_map, + col_map_type col_map, int num_columns, device_span const column_infos) { auto const rec_id = threadIdx.x + (blockDim.x * blockIdx.x); if (rec_id >= row_offsets.size()) return; - auto const are_rows_objects = col_map != nullptr; + auto const are_rows_objects = col_map.capacity() != 0; auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); size_type input_field_index = 0; @@ -678,8 +681,14 @@ void convert_json_to_columns(parse_options_view const& opts, const int grid_size = (row_offsets.size() + block_size - 1) / block_size; - convert_data_to_columns_kernel<<>>( - opts, data, row_offsets, column_types, col_map, output_columns, valid_fields, num_valid_fields); + convert_data_to_columns_kernel<<>>(opts, + data, + row_offsets, + column_types, + *col_map, + output_columns, + valid_fields, + num_valid_fields); CUDA_TRY(cudaGetLastError()); } @@ -724,7 +733,7 @@ std::vector detect_data_types( const int grid_size = (row_offsets.size() + block_size - 1) / block_size; detect_data_types_kernel<<>>( - options, data, row_offsets, col_map, num_columns, d_column_infos); + options, data, row_offsets, *col_map, num_columns, d_column_infos); return cudf::detail::make_std_vector_sync(d_column_infos, stream); } diff --git a/cpp/src/io/json/json_gpu.h b/cpp/src/io/json/json_gpu.h index 7a6bce5e5a5..92024c3e8e6 100644 --- a/cpp/src/io/json/json_gpu.h +++ b/cpp/src/io/json/json_gpu.h @@ -16,8 +16,8 @@ #pragma once +#include #include -#include "json_common.h" #include diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 799c9b3451e..0d819930ac9 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -14,14 +14,12 @@ * limitations under the License. */ -/** - * @file reader_impl.cu - * @brief cuDF-IO JSON reader class implementation - */ +#include "json_gpu.h" -#include "reader_impl.hpp" +#include #include +#include #include #include @@ -29,15 +27,19 @@ #include #include #include +#include +#include +#include #include #include #include +#include #include #include -#include #include #include +#include #include #include @@ -48,7 +50,12 @@ namespace cudf { namespace io { namespace detail { namespace json { + using namespace cudf::io; +using namespace cudf::io::json; + +using col_map_type = cudf::io::json::gpu::col_map_type; +using col_map_ptr_type = std::unique_ptr>; /** * @brief Aggregate the table containing keys info by their hash values. @@ -88,7 +95,7 @@ std::unique_ptr
<table> aggregate_keys_info(std::unique_ptr<table>
info) col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, rmm::cuda_stream_view stream) { - auto key_col_map{col_map_type::create(column_name_hashes.size(), stream)}; + auto key_col_map = col_map_type::create(column_name_hashes.size(), stream); auto const column_data = column_name_hashes.data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -111,7 +118,7 @@ col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, * * @return std::unique_ptr
<table> cudf table with three columns (offsets, lengths, hashes) */ -std::unique_ptr<table>
create_json_keys_info_table(const parse_options_view& options, +std::unique_ptr<table>
create_json_keys_info_table(parse_options_view const& parse_opts, device_span const data, device_span const row_offsets, rmm::cuda_stream_view stream) @@ -119,7 +126,7 @@ std::unique_ptr<table>
create_json_keys_info_table(const parse_options_view& opt // Count keys rmm::device_scalar key_counter(0, stream); cudf::io::json::gpu::collect_keys_info( - options, data, row_offsets, key_counter.data(), {}, stream); + parse_opts, data, row_offsets, key_counter.data(), {}, stream); // Allocate columns to store hash value, length, and offset of each JSON object key in the input auto const num_keys = key_counter.value(stream); @@ -135,7 +142,7 @@ std::unique_ptr
create_json_keys_info_table(const parse_options_view& opt key_counter.set_value_to_zero_async(stream); // Fill the allocated columns cudf::io::json::gpu::collect_keys_info( - options, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream); + parse_opts, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream); return info_table; } @@ -183,113 +190,91 @@ auto sort_keys_info_by_offset(std::unique_ptr
info) * * @return Names of JSON object keys in the file */ -std::pair, col_map_ptr_type> reader::impl::get_json_object_keys_hashes( - device_span rec_starts, rmm::cuda_stream_view stream) +std::pair, col_map_ptr_type> get_json_object_keys_hashes( + parse_options_view const& parse_opts, + host_span h_data, + device_span rec_starts, + device_span d_data, + rmm::cuda_stream_view stream) { - auto info = create_json_keys_info_table( - opts_.view(), - device_span(static_cast(data_.data()), data_.size()), - rec_starts, - stream); + auto info = create_json_keys_info_table(parse_opts, d_data, rec_starts, stream); auto aggregated_info = aggregate_keys_info(std::move(info)); auto sorted_info = sort_keys_info_by_offset(std::move(aggregated_info)); - return {create_key_strings(uncomp_data_, sorted_info->view(), stream), + return {create_key_strings(h_data.data(), sorted_info->view(), stream), create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; } -/** - * @brief Ingest input JSON file/buffer, without decompression. - * - * Sets the sources_, byte_range_offset_, and byte_range_size_ data members - * - * @param[in] range_offset Number of bytes offset from the start - * @param[in] range_size Bytes to read; use `0` for all remaining data - * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data - */ -void reader::impl::ingest_raw_input(size_t range_offset, - size_t range_size, - size_t range_size_padded) +std::vector ingest_raw_input(std::vector> const& sources, + compression_type compression, + size_t range_offset, + size_t range_size, + size_t range_size_padded) { // Iterate through the user defined sources and read the contents into the local buffer - CUDF_EXPECTS(!sources_.empty(), "No sources were defined"); size_t total_source_size = 0; - for (const auto& source : sources_) { + for (const auto& source : sources) { total_source_size += source->size(); } - total_source_size = total_source_size - range_offset; + total_source_size = total_source_size - (range_offset * sources.size()); + + auto buffer = std::vector(total_source_size); - buffer_.resize(total_source_size); size_t bytes_read = 0; - for (const auto& source : sources_) { + for (const auto& source : sources) { if (!source->is_empty()) { - auto data_size = (range_size_padded != 0) ? range_size_padded : source->size(); - bytes_read += source->host_read(range_offset, data_size, &buffer_[bytes_read]); + auto data_size = (range_size_padded != 0) ? 
range_size_padded : source->size(); + auto destination = reinterpret_cast(buffer.data()) + bytes_read; + bytes_read += source->host_read(range_offset, data_size, destination); } } - byte_range_offset_ = range_offset; - byte_range_size_ = range_size; - load_whole_source_ = byte_range_offset_ == 0 && byte_range_size_ == 0; + if (compression == compression_type::NONE) { + return buffer; + } else { + return get_uncompressed_data(buffer, compression); + } } -/** - * @brief Decompress the input data, if needed - * - * Sets the uncomp_data_ and uncomp_size_ data members - * Loads the data into device memory if byte range parameters are not used - */ -void reader::impl::decompress_input(rmm::cuda_stream_view stream) +bool should_load_whole_source(json_reader_options const& reader_opts) { - if (options_.get_compression() == compression_type::NONE) { - // Do not use the owner vector here to avoid extra copy - uncomp_data_ = reinterpret_cast(buffer_.data()); - uncomp_size_ = buffer_.size(); - } else { - uncomp_data_owner_ = get_uncompressed_data( // - host_span( // - reinterpret_cast(buffer_.data()), - buffer_.size()), - options_.get_compression()); - - uncomp_data_ = uncomp_data_owner_.data(); - uncomp_size_ = uncomp_data_owner_.size(); - } - if (load_whole_source_) data_ = rmm::device_buffer(uncomp_data_, uncomp_size_, stream); + return reader_opts.get_byte_range_offset() == 0 and // + reader_opts.get_byte_range_size() == 0; } -rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_view stream) +rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, + host_span h_data, + device_span d_data, + rmm::cuda_stream_view stream) { std::vector chars_to_count{'\n'}; // Currently, ignoring lineterminations within quotes is handled by recording the records of both, // and then filtering out the records that is a quotechar or a linetermination within a quotechar // pair. - if (allow_newlines_in_strings_) { chars_to_count.push_back('\"'); } // If not starting at an offset, add an extra row to account for the first row in the file - cudf::size_type prefilter_count = ((byte_range_offset_ == 0) ? 1 : 0); - if (load_whole_source_) { - prefilter_count += count_all_from_set(data_, chars_to_count, stream); + cudf::size_type prefilter_count = ((reader_opts.get_byte_range_offset() == 0) ? 
1 : 0); + if (should_load_whole_source(reader_opts)) { + prefilter_count += count_all_from_set(d_data, chars_to_count, stream); } else { - prefilter_count += count_all_from_set(uncomp_data_, uncomp_size_, chars_to_count, stream); + prefilter_count += count_all_from_set(h_data, chars_to_count, stream); } rmm::device_uvector rec_starts(prefilter_count, stream); auto* find_result_ptr = rec_starts.data(); // Manually adding an extra row to account for the first row in the file - if (byte_range_offset_ == 0) { + if (reader_opts.get_byte_range_offset() == 0) { find_result_ptr++; CUDA_TRY(cudaMemsetAsync(rec_starts.data(), 0ull, sizeof(uint64_t), stream.value())); } std::vector chars_to_find{'\n'}; - if (allow_newlines_in_strings_) { chars_to_find.push_back('\"'); } // Passing offset = 1 to return positions AFTER the found character - if (load_whole_source_) { - find_all_from_set(data_, chars_to_find, 1, find_result_ptr, stream); + if (should_load_whole_source(reader_opts)) { + find_all_from_set(d_data, chars_to_find, 1, find_result_ptr, stream); } else { - find_all_from_set(uncomp_data_, uncomp_size_, chars_to_find, 1, find_result_ptr, stream); + find_all_from_set(h_data, chars_to_find, 1, find_result_ptr, stream); } // Previous call stores the record pinput_file.typeositions as encountered by all threads @@ -298,30 +283,9 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ thrust::sort(rmm::exec_policy(stream), rec_starts.begin(), rec_starts.end()); auto filtered_count = prefilter_count; - if (allow_newlines_in_strings_) { - auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - bool quotation = false; - for (cudf::size_type i = 1; i < prefilter_count; ++i) { - if (uncomp_data_[h_rec_starts[i] - 1] == '\"') { - quotation = !quotation; - h_rec_starts[i] = uncomp_size_; - filtered_count--; - } else if (quotation) { - h_rec_starts[i] = uncomp_size_; - filtered_count--; - } - } - CUDA_TRY(cudaMemcpyAsync(rec_starts.data(), - h_rec_starts.data(), - h_rec_starts.size() * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - thrust::sort(rmm::exec_policy(stream), rec_starts.begin(), rec_starts.end()); - stream.synchronize(); - } // Exclude the ending newline as it does not precede a record start - if (uncomp_data_[uncomp_size_ - 1] == '\n') { filtered_count--; } + if (h_data.back() == '\n') { filtered_count--; } rec_starts.resize(filtered_count, stream); return rec_starts; @@ -334,50 +298,54 @@ rmm::device_uvector reader::impl::find_record_starts(rmm::cuda_stream_ * Only rows that need to be parsed are copied, based on the byte range * Also updates the array of record starts to match the device data offset. 
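`upload_data_to_device`, documented above and refactored in the next hunk, trims record starts that fall outside the requested byte range and rebases the surviving offsets so they index into the uploaded slice rather than the whole host buffer. A host-only sketch of that arithmetic (hypothetical names; `std::vector<uint64_t>` stands in for the device vector, and error handling is omitted):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

struct trimmed_range {
  std::vector<uint64_t> rec_starts;  // offsets relative to the uploaded slice
  uint64_t start_offset;             // first byte to upload
  uint64_t end_offset;               // one past the last byte to upload
};

// Keep only records that begin inside the byte range and shift the surviving
// offsets so they index into the uploaded sub-buffer.
trimmed_range trim_and_rebase(std::vector<uint64_t> rec_starts,
                              uint64_t data_size,
                              uint64_t byte_range_size)
{
  uint64_t end_offset = data_size;
  if (byte_range_size != 0) {
    while (!rec_starts.empty() && rec_starts.back() > byte_range_size) {
      end_offset = rec_starts.back();  // last trimmed record bounds the upload
      rec_starts.pop_back();
    }
  }
  uint64_t const start_offset = rec_starts.empty() ? 0 : rec_starts.front();
  for (auto& r : rec_starts) { r -= start_offset; }
  return {std::move(rec_starts), start_offset, end_offset};
}

int main()
{
  auto const t = trim_and_rebase({0, 100, 220, 350}, /*data_size=*/500, /*byte_range_size=*/300);
  assert(t.rec_starts.size() == 3 && t.start_offset == 0 && t.end_offset == 350);
}
```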
*/ -void reader::impl::upload_data_to_device(rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream) +rmm::device_uvector upload_data_to_device(json_reader_options const& reader_opts, + host_span h_data, + rmm::device_uvector& rec_starts, + rmm::cuda_stream_view stream) { - size_t start_offset = 0; - size_t end_offset = uncomp_size_; + size_t end_offset = h_data.size(); // Trim lines that are outside range - if (byte_range_size_ != 0 || byte_range_offset_ != 0) { - auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - - if (byte_range_size_ != 0) { - auto it = h_rec_starts.end() - 1; - while (it >= h_rec_starts.begin() && *it > byte_range_size_) { - end_offset = *it; - --it; - } - h_rec_starts.erase(it + 1, h_rec_starts.end()); - } + auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - // Resize to exclude rows outside of the range - // Adjust row start positions to account for the data subcopy - start_offset = h_rec_starts.front(); - rec_starts.resize(h_rec_starts.size(), stream); - thrust::transform(rmm::exec_policy(stream), - rec_starts.begin(), - rec_starts.end(), - thrust::make_constant_iterator(start_offset), - rec_starts.begin(), - thrust::minus()); + if (reader_opts.get_byte_range_size() != 0) { + auto it = h_rec_starts.end() - 1; + while (it >= h_rec_starts.begin() && *it > reader_opts.get_byte_range_size()) { + end_offset = *it; + --it; + } + h_rec_starts.erase(it + 1, h_rec_starts.end()); } + // Resize to exclude rows outside of the range + // Adjust row start positions to account for the data subcopy + size_t start_offset = h_rec_starts.front(); + rec_starts.resize(h_rec_starts.size(), stream); + thrust::transform(rmm::exec_policy(stream), + rec_starts.begin(), + rec_starts.end(), + thrust::make_constant_iterator(start_offset), + rec_starts.begin(), + thrust::minus()); + const size_t bytes_to_upload = end_offset - start_offset; - CUDF_EXPECTS(bytes_to_upload <= uncomp_size_, + CUDF_EXPECTS(bytes_to_upload <= h_data.size(), "Error finding the record within the specified byte range.\n"); // Upload the raw data that is within the rows of interest - data_ = rmm::device_buffer(uncomp_data_ + start_offset, bytes_to_upload, stream); + return cudf::detail::make_device_uvector_async(h_data.subspan(start_offset, bytes_to_upload), + stream); } -void reader::impl::set_column_names(device_span rec_starts, - rmm::cuda_stream_view stream) +std::pair, col_map_ptr_type> get_column_names_and_map( + parse_options_view const& parse_opts, + host_span h_data, + device_span rec_starts, + device_span d_data, + rmm::cuda_stream_view stream) { // If file only contains one row, use the file size for the row size - uint64_t first_row_len = data_.size() / sizeof(char); + uint64_t first_row_len = d_data.size() / sizeof(char); if (rec_starts.size() > 1) { // Set first_row_len to the offset of the second row, if it exists CUDA_TRY(cudaMemcpyAsync(&first_row_len, @@ -388,7 +356,7 @@ void reader::impl::set_column_names(device_span rec_starts, } std::vector first_row(first_row_len); CUDA_TRY(cudaMemcpyAsync(first_row.data(), - data_.data(), + d_data.data(), first_row_len * sizeof(char), cudaMemcpyDeviceToHost, stream.value())); @@ -405,64 +373,64 @@ void reader::impl::set_column_names(device_span rec_starts, // If the first opening bracket is '{', assume object format if (first_curly_bracket < first_square_bracket) { // use keys as column names if input rows are objects - auto keys_desc = get_json_object_keys_hashes(rec_starts, stream); - 
metadata_.column_names = keys_desc.first; - set_column_map(std::move(keys_desc.second), stream); + return get_json_object_keys_hashes(parse_opts, h_data, rec_starts, d_data, stream); } else { - int cols_found = 0; - bool quotation = false; + int cols_found = 0; + bool quotation = false; + auto column_names = std::vector(); for (size_t pos = 0; pos < first_row.size(); ++pos) { // Flip the quotation flag if current character is a quotechar - if (first_row[pos] == opts_.quotechar) { + if (first_row[pos] == parse_opts.quotechar) { quotation = !quotation; } // Check if end of a column/row - else if (pos == first_row.size() - 1 || (!quotation && first_row[pos] == opts_.delimiter)) { - metadata_.column_names.emplace_back(std::to_string(cols_found++)); + else if (pos == first_row.size() - 1 || + (!quotation && first_row[pos] == parse_opts.delimiter)) { + column_names.emplace_back(std::to_string(cols_found++)); } } + return {column_names, col_map_type::create(0, stream)}; } } -void reader::impl::set_data_types(device_span rec_starts, - rmm::cuda_stream_view stream) +std::vector get_data_types(json_reader_options const& reader_opts, + parse_options_view const& parse_opts, + std::vector const& column_names, + col_map_type* column_map, + device_span rec_starts, + device_span data, + rmm::cuda_stream_view stream) { bool has_to_infer_column_types = - std::visit([](const auto& dtypes) { return dtypes.empty(); }, options_.get_dtypes()); + std::visit([](const auto& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); + if (!has_to_infer_column_types) { - dtypes_ = std::visit(cudf::detail::visitor_overload{ - [&](const std::vector& dtypes) { - CUDF_EXPECTS(dtypes.size() == metadata_.column_names.size(), - "Must specify types for all columns"); - return dtypes; - }, - [&](const std::map& dtypes) { - std::vector sorted_dtypes; - std::transform(std::cbegin(metadata_.column_names), - std::cend(metadata_.column_names), - std::back_inserter(sorted_dtypes), - [&](auto const& column_name) { - auto const it = dtypes.find(column_name); - CUDF_EXPECTS(it != dtypes.end(), - "Must specify types for all columns"); - return it->second; - }); - return sorted_dtypes; - }}, - options_.get_dtypes()); + return std::visit( + cudf::detail::visitor_overload{ + [&](const std::vector& dtypes) { + CUDF_EXPECTS(dtypes.size() == column_names.size(), "Must specify types for all columns"); + return dtypes; + }, + [&](const std::map& dtypes) { + std::vector sorted_dtypes; + std::transform(std::cbegin(column_names), + std::cend(column_names), + std::back_inserter(sorted_dtypes), + [&](auto const& column_name) { + auto const it = dtypes.find(column_name); + CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); + return it->second; + }); + return sorted_dtypes; + }}, + reader_opts.get_dtypes()); } else { CUDF_EXPECTS(rec_starts.size() != 0, "No data available for data type inference.\n"); - auto const num_columns = metadata_.column_names.size(); - auto const do_set_null_count = key_to_col_idx_map_ != nullptr; + auto const num_columns = column_names.size(); + auto const do_set_null_count = column_map->capacity() > 0; auto const h_column_infos = cudf::io::json::gpu::detect_data_types( - opts_.view(), - device_span(static_cast(data_.data()), data_.size()), - rec_starts, - do_set_null_count, - num_columns, - get_column_map_device_ptr(), - stream); + parse_opts, data, rec_starts, do_set_null_count, num_columns, column_map, stream); auto get_type_id = [&](auto const& cinfo) { auto int_count_total = @@ -489,23 +457,33 
@@ void reader::impl::set_data_types(device_span rec_starts, } }; + std::vector dtypes; + std::transform(std::cbegin(h_column_infos), std::cend(h_column_infos), - std::back_inserter(dtypes_), + std::back_inserter(dtypes), [&](auto const& cinfo) { return data_type{get_type_id(cinfo)}; }); + + return dtypes; } } -table_with_metadata reader::impl::convert_data_to_table(device_span rec_starts, - rmm::cuda_stream_view stream) +table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, + std::vector const& dtypes, + std::vector const& column_names, + col_map_type* column_map, + device_span rec_starts, + device_span data, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - const auto num_columns = dtypes_.size(); + const auto num_columns = dtypes.size(); const auto num_records = rec_starts.size(); // alloc output buffers. std::vector out_buffers; for (size_t col = 0; col < num_columns; ++col) { - out_buffers.emplace_back(dtypes_[col], num_records, true, stream, mr_); + out_buffers.emplace_back(dtypes[col], num_records, true, stream, mr); } thrust::host_vector h_dtypes(num_columns); @@ -513,7 +491,7 @@ table_with_metadata reader::impl::convert_data_to_table(device_span h_valid(num_columns); for (size_t i = 0; i < num_columns; ++i) { - h_dtypes[i] = dtypes_[i]; + h_dtypes[i] = dtypes[i]; h_data[i] = out_buffers[i].data(); h_valid[i] = out_buffers[i].null_mask(); } @@ -525,15 +503,7 @@ table_with_metadata reader::impl::convert_data_to_table(device_span(num_columns, stream); cudf::io::json::gpu::convert_json_to_columns( - opts_.view(), - device_span(static_cast(data_.data()), data_.size()), - rec_starts, - d_dtypes, - get_column_map_device_ptr(), - d_data, - d_valid, - d_valid_counts, - stream); + parse_opts, data, rec_starts, d_dtypes, column_map, d_data, d_valid, d_valid_counts, stream); stream.synchronize(); @@ -560,11 +530,11 @@ table_with_metadata reader::impl::convert_data_to_table(device_spantype().id() == type_id::STRING) { // Need to remove escape character in case of '\"' and '\\' out_columns.emplace_back(cudf::strings::detail::replace( - out_column->view(), target->view(), repl->view(), stream, mr_)); + out_column->view(), target->view(), repl->view(), stream, mr)); } else { out_columns.emplace_back(std::move(out_column)); } @@ -576,22 +546,7 @@ table_with_metadata reader::impl::convert_data_to_table(device_span(std::move(out_columns)), metadata_}; -} - -reader::impl::impl(std::vector>&& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : options_(options), mr_(mr), sources_(std::move(sources)) -{ - CUDF_EXPECTS(options_.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); - - opts_.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - opts_.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - opts_.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - - opts_.dayfirst = options.is_enabled_dayfirst(); + return table_with_metadata{std::make_unique
(std::move(out_columns)), {column_names}}; } /** @@ -603,52 +558,65 @@ reader::impl::impl(std::vector>&& sources, * * @return Table and its metadata */ -table_with_metadata reader::impl::read(json_reader_options const& options, - rmm::cuda_stream_view stream) +table_with_metadata read_json(std::vector>& sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - auto range_offset = options.get_byte_range_offset(); - auto range_size = options.get_byte_range_size(); - auto range_size_padded = options.get_byte_range_size_with_padding(); + CUDF_EXPECTS(not sources.empty(), "No sources were defined"); - ingest_raw_input(range_offset, range_size, range_size_padded); - CUDF_EXPECTS(buffer_.size() != 0, "Ingest failed: input data is null.\n"); + CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); - decompress_input(stream); - CUDF_EXPECTS(uncomp_data_ != nullptr, "Ingest failed: uncompressed input data is null.\n"); - CUDF_EXPECTS(uncomp_size_ != 0, "Ingest failed: uncompressed input data has zero size.\n"); + auto parse_opts = parse_options{',', '\n', '\"', '.'}; - auto rec_starts = find_record_starts(stream); - CUDF_EXPECTS(!rec_starts.is_empty(), "Error enumerating records.\n"); + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - upload_data_to_device(rec_starts, stream); - CUDF_EXPECTS(data_.size() != 0, "Error uploading input data to the GPU.\n"); + parse_opts.dayfirst = reader_opts.is_enabled_dayfirst(); - set_column_names(rec_starts, stream); - CUDF_EXPECTS(!metadata_.column_names.empty(), "Error determining column names.\n"); + auto range_offset = reader_opts.get_byte_range_offset(); + auto range_size = reader_opts.get_byte_range_size(); + auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - set_data_types(rec_starts, stream); - CUDF_EXPECTS(!dtypes_.empty(), "Error in data type detection.\n"); + auto h_data = ingest_raw_input( + sources, reader_opts.get_compression(), range_offset, range_size, range_size_padded); - return convert_data_to_table(rec_starts, stream); -} + CUDF_EXPECTS(h_data.size() != 0, "Ingest failed: uncompressed input data has zero size.\n"); -// Forward to implementation -reader::reader(std::vector>&& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - _impl = std::make_unique(std::move(sources), options, stream, mr); -} + auto d_data = rmm::device_uvector(0, stream); -// Destructor within this translation unit -reader::~reader() = default; + if (should_load_whole_source(reader_opts)) { + d_data = cudf::detail::make_device_uvector_async(h_data, stream); + } -// Forward to implementation -table_with_metadata reader::read(json_reader_options const& options, rmm::cuda_stream_view stream) -{ - return table_with_metadata{_impl->read(options, stream)}; + auto rec_starts = find_record_starts(reader_opts, h_data, d_data, stream); + + CUDF_EXPECTS(rec_starts.size() > 0, "Error enumerating records.\n"); + + if (not should_load_whole_source(reader_opts)) { + d_data = upload_data_to_device(reader_opts, h_data, rec_starts, stream); + } + + CUDF_EXPECTS(d_data.size() != 0, "Error uploading input data to the GPU.\n"); + + auto column_names_and_map = + 
get_column_names_and_map(parse_opts.view(), h_data, rec_starts, d_data, stream); + + auto column_names = std::get<0>(column_names_and_map); + auto column_map = std::move(std::get<1>(column_names_and_map)); + + CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); + + auto dtypes = get_data_types( + reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, d_data, stream); + + CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); + + return convert_data_to_table( + parse_opts.view(), dtypes, column_names, column_map.get(), rec_starts, d_data, stream, mr); } + } // namespace json } // namespace detail } // namespace io diff --git a/cpp/src/io/json/reader_impl.hpp b/cpp/src/io/json/reader_impl.hpp deleted file mode 100644 index 4d14edf360a..00000000000 --- a/cpp/src/io/json/reader_impl.hpp +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file reader_impl.hpp - * @brief cuDF-IO JSON reader class implementation header - */ - -#pragma once - -#include "json_common.h" -#include "json_gpu.h" - -#include - -#include - -#include -#include -#include - -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace json { -using namespace cudf::io::json; -using namespace cudf::io; - -using col_map_type = cudf::io::json::gpu::col_map_type; -using col_map_ptr_type = std::unique_ptr>; - -/** - * @brief Class used to parse Json input and convert it into gdf columns. - */ -class reader::impl { - public: - private: - const json_reader_options options_{}; - - rmm::mr::device_memory_resource* mr_ = nullptr; - - std::vector> sources_; - std::vector buffer_; - - const char* uncomp_data_ = nullptr; - size_t uncomp_size_ = 0; - - // Used when the input data is compressed, to ensure the allocated uncompressed data is freed - std::vector uncomp_data_owner_; - rmm::device_buffer data_; - - size_t byte_range_offset_ = 0; - size_t byte_range_size_ = 0; - bool load_whole_source_ = true; - - table_metadata metadata_; - std::vector dtypes_; - - // the map is only used for files with rows in object format; initialize to a dummy value so the - // map object can be passed to the kernel in any case - col_map_ptr_type key_to_col_idx_map_; - std::unique_ptr> d_key_col_map_; - - // parsing options - const bool allow_newlines_in_strings_ = false; - parse_options opts_{',', '\n', '\"', '.'}; - - /** - * @brief Sets the column map data member and makes a device copy to be used as a kernel - * parameter. - */ - void set_column_map(col_map_ptr_type&& map, rmm::cuda_stream_view stream) - { - key_to_col_idx_map_ = std::move(map); - d_key_col_map_ = - std::make_unique>(*key_to_col_idx_map_, stream); - } - /** - * @brief Gets the pointer to the column hash map in the device memory. - * - * Returns `nullptr` if the map is not created. - */ - auto get_column_map_device_ptr() - { - return key_to_col_idx_map_ ? 
d_key_col_map_->data() : nullptr; - } - - /** - * @brief Ingest input JSON file/buffer, without decompression - * - * Sets the source_, byte_range_offset_, and byte_range_size_ data members - * - * @param[in] range_offset Number of bytes offset from the start - * @param[in] range_size Bytes to read; use `0` for all remaining data - * @param[in] range_size_padded Bytes to read with padding; use `0` for all remaining data - */ - void ingest_raw_input(size_t range_offset, size_t range_size, size_t range_size_padded); - - /** - * @brief Extract the JSON objects keys from the input file with object rows. - * - * @return Array of keys and a map that maps their hash values to column indices - */ - std::pair, col_map_ptr_type> get_json_object_keys_hashes( - device_span rec_starts, rmm::cuda_stream_view stream); - - /** - * @brief Decompress the input data, if needed - * - * Sets the uncomp_data_ and uncomp_size_ data members - */ - void decompress_input(rmm::cuda_stream_view stream); - - /** - * @brief Finds all record starts in the file. - * - * Does not upload the entire file to the GPU - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * @return Record starts in the device memory - */ - rmm::device_uvector find_record_starts(rmm::cuda_stream_view stream); - - /** - * @brief Uploads the relevant segment of the input json data onto the GPU. - * - * Sets the d_data_ data member. - * Only rows that need to be parsed are copied, based on the byte range - * Also updates the array of record starts to match the device data offset. - */ - void upload_data_to_device(rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream); - - /** - * @brief Parse the first row to set the column name - * - * Sets the column_names_ data member - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ - void set_column_names(device_span rec_starts, rmm::cuda_stream_view stream); - - /** - * @brief Set the data type array data member - * - * If user does not pass the data types, deduces types from the file content - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ - void set_data_types(device_span rec_starts, rmm::cuda_stream_view stream); - - /** - * @brief Parse the input data and store results a table - * - * @param[in] rec_starts Record starts in device memory - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @return Table and its metadata - */ - table_with_metadata convert_data_to_table(device_span rec_starts, - rmm::cuda_stream_view stream); - - public: - /** - * @brief Constructor from a dataset source with reader options. - */ - explicit impl(std::vector>&& sources, - json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); - - /** - * @brief Read an entire set or a subset of data from the source - * - * @param[in] options Settings for controlling reading behavior - * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
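The deleted header above keeps a separate `d_key_col_map_` copy of the column hash map purely so a raw device pointer can be handed to the kernels, while the `json_gpu.cu` hunks earlier in this diff switch to passing a map view to the kernels by value and treating `capacity() == 0` as the "rows are not objects" sentinel. Below is a simplified CUDA sketch of that pass-a-view-by-value pattern; the `map_view` type is a hand-rolled stand-in, not cuco's actual static-map interface:

```cpp
// sketch.cu
#include <cstdio>
#include <cuda_runtime.h>

// A trivially copyable view that can be passed to a kernel by value.
// capacity == 0 stands in for "no map": rows are not JSON objects.
struct map_view {
  int const* keys;
  int const* values;
  int capacity;

  __device__ int find(int key) const
  {
    for (int i = 0; i < capacity; ++i) {
      if (keys[i] == key) return values[i];
    }
    return -1;  // not found; the caller falls back to the field index
  }
};

__global__ void lookup_kernel(map_view map, int key, int* out)
{
  // No null-pointer checks needed: an empty view simply reports "not found".
  *out = (map.capacity == 0) ? -1 : map.find(key);
}

int main()
{
  int h_keys[] = {7, 11}, h_vals[] = {0, 1};
  int *d_keys, *d_vals, *d_out;
  cudaMalloc(&d_keys, sizeof(h_keys));
  cudaMalloc(&d_vals, sizeof(h_vals));
  cudaMalloc(&d_out, sizeof(int));
  cudaMemcpy(d_keys, h_keys, sizeof(h_keys), cudaMemcpyHostToDevice);
  cudaMemcpy(d_vals, h_vals, sizeof(h_vals), cudaMemcpyHostToDevice);

  lookup_kernel<<<1, 1>>>(map_view{d_keys, d_vals, 2}, 11, d_out);

  int result = 0;
  cudaMemcpy(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost);
  std::printf("column index: %d\n", result);  // prints "column index: 1"

  cudaFree(d_keys);
  cudaFree(d_vals);
  cudaFree(d_out);
}
```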
- * - * @return Table and its metadata - */ - table_with_metadata read(json_reader_options const& options, rmm::cuda_stream_view stream); -}; - -} // namespace json -} // namespace detail -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp new file mode 100644 index 00000000000..45d60605936 --- /dev/null +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "aggregate_orc_metadata.hpp" + +#include +#include + +namespace cudf::io::orc::detail { + +column_hierarchy::column_hierarchy(nesting_map child_map) : children{std::move(child_map)} +{ + // Sort columns by nesting levels + std::function levelize = [&](size_type id, int32_t level) { + if (static_cast(levels.size()) == level) levels.emplace_back(); + + levels[level].push_back({id, static_cast(children[id].size())}); + + for (auto child_id : children[id]) { + levelize(child_id, level + 1); + } + }; + + std::for_each( + children[0].cbegin(), children[0].cend(), [&](auto col_id) { levelize(col_id, 0); }); +} + +namespace { + +/** + * @brief Goes up to the root to include the column with the given id and its parents. + */ +void update_parent_mapping(std::map>& selected_columns, + metadata const& metadata, + size_type id) +{ + auto current_id = id; + while (metadata.column_has_parent(current_id)) { + auto parent_id = metadata.parent_id(current_id); + if (std::find(selected_columns[parent_id].cbegin(), + selected_columns[parent_id].cend(), + current_id) == selected_columns[parent_id].end()) { + selected_columns[parent_id].push_back(current_id); + } + current_id = parent_id; + } +} + +/** + * @brief Adds all columns nested under the column with the given id to the nesting map. + */ +void add_nested_columns(std::map>& selected_columns, + std::vector const& types, + size_type id) +{ + for (auto child_id : types[id].subtypes) { + if (std::find(selected_columns[id].cbegin(), selected_columns[id].cend(), child_id) == + selected_columns[id].end()) { + selected_columns[id].push_back(child_id); + } + add_nested_columns(selected_columns, types, child_id); + } +} + +/** + * @brief Adds the column with the given id to the mapping + * + * All nested columns and direct ancestors of column `id` are included. + * Columns that are not on the direct path are excluded, which may result in prunning. 
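The `column_hierarchy` constructor near the top of `aggregate_orc_metadata.cpp` flattens the parent/child map into per-nesting-level lists with a recursive `std::function` lambda. A standalone sketch of that traversal with plain `int` column IDs (column 0 plays the role of the ORC root, as in the file above):

```cpp
#include <cstddef>
#include <functional>
#include <iostream>
#include <map>
#include <vector>

using nesting_map = std::map<int, std::vector<int>>;

// Flatten the tree rooted at column 0 into levels[d] = columns at depth d.
std::vector<std::vector<int>> levelize(nesting_map const& children)
{
  std::vector<std::vector<int>> levels;
  std::function<void(int, int)> visit = [&](int id, int level) {
    if (static_cast<int>(levels.size()) == level) levels.emplace_back();
    levels[level].push_back(id);
    if (auto const it = children.find(id); it != children.end()) {
      for (auto child : it->second) visit(child, level + 1);
    }
  };
  // Children of the root column sit at level 0, matching the ORC reader.
  if (auto const it = children.find(0); it != children.end()) {
    for (auto col : it->second) visit(col, 0);
  }
  return levels;
}

int main()
{
  // Root 0 has children {1, 2}; column 2 is a struct with children {3, 4}.
  auto const levels = levelize({{0, {1, 2}}, {2, {3, 4}}});
  for (std::size_t d = 0; d < levels.size(); ++d) {
    std::cout << "level " << d << ':';
    for (auto c : levels[d]) std::cout << ' ' << c;
    std::cout << '\n';
  }
  // level 0: 1 2
  // level 1: 3 4
}
```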
+ */ +void add_column_to_mapping(std::map>& selected_columns, + metadata const& metadata, + size_type id) +{ + update_parent_mapping(selected_columns, metadata, id); + add_nested_columns(selected_columns, metadata.ff.types, id); +} + +/** + * @brief Create a metadata object from each element in the source vector + */ +auto metadatas_from_sources(std::vector> const& sources) +{ + std::vector metadatas; + std::transform( + sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) { + return metadata(source.get()); + }); + return metadatas; +} + +} // namespace + +size_type aggregate_orc_metadata::calc_num_rows() const +{ + return std::accumulate( + per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto const& sum, auto const& pfm) { + return sum + pfm.get_total_rows(); + }); +} + +size_type aggregate_orc_metadata::calc_num_stripes() const +{ + return std::accumulate( + per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto const& sum, auto const& pfm) { + return sum + pfm.get_num_stripes(); + }); +} + +aggregate_orc_metadata::aggregate_orc_metadata( + std::vector> const& sources) + : per_file_metadata(metadatas_from_sources(sources)), + num_rows(calc_num_rows()), + num_stripes(calc_num_stripes()) +{ + // Verify that the input files have the same number of columns, + // as well as matching types, compression, and names + for (auto const& pfm : per_file_metadata) { + CUDF_EXPECTS(per_file_metadata[0].get_num_columns() == pfm.get_num_columns(), + "All sources must have the same number of columns"); + CUDF_EXPECTS(per_file_metadata[0].ps.compression == pfm.ps.compression, + "All sources must have the same compression type"); + + // Check the types, column names, and decimal scale + for (size_t i = 0; i < pfm.ff.types.size(); i++) { + CUDF_EXPECTS(pfm.ff.types[i].kind == per_file_metadata[0].ff.types[i].kind, + "Column types across all input sources must be the same"); + CUDF_EXPECTS(std::equal(pfm.ff.types[i].fieldNames.begin(), + pfm.ff.types[i].fieldNames.end(), + per_file_metadata[0].ff.types[i].fieldNames.begin()), + "All source column names must be the same"); + CUDF_EXPECTS( + pfm.ff.types[i].scale.value_or(0) == per_file_metadata[0].ff.types[i].scale.value_or(0), + "All scale values must be the same"); + } + } +} + +std::vector aggregate_orc_metadata::select_stripes( + std::vector> const& user_specified_stripes, + size_type& row_start, + size_type& row_count) +{ + std::vector selected_stripes_mapping; + + if (!user_specified_stripes.empty()) { + CUDF_EXPECTS(user_specified_stripes.size() == per_file_metadata.size(), + "Must specify stripes for each source"); + // row_start is 0 if stripes are set. 
If this is not true anymore, then + // row_start needs to be subtracted to get the correct row_count + CUDF_EXPECTS(row_start == 0, "Start row index should be 0"); + + row_count = 0; + // Each vector entry represents a source file; each nested vector represents the + // user_defined_stripes to get from that source file + for (size_t src_file_idx = 0; src_file_idx < user_specified_stripes.size(); ++src_file_idx) { + std::vector stripe_infos; + + // Coalesce stripe info at the source file later since that makes downstream processing much + // easier in impl::read + for (const size_t& stripe_idx : user_specified_stripes[src_file_idx]) { + CUDF_EXPECTS(stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size(), + "Invalid stripe index"); + stripe_infos.push_back( + std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); + row_count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; + } + selected_stripes_mapping.push_back({static_cast(src_file_idx), stripe_infos}); + } + } else { + row_start = std::max(row_start, 0); + if (row_count < 0) { + row_count = static_cast( + std::min(get_num_rows(), std::numeric_limits::max())); + } + row_count = std::min(row_count, get_num_rows() - row_start); + CUDF_EXPECTS(row_count >= 0, "Invalid row count"); + CUDF_EXPECTS(row_start <= get_num_rows(), "Invalid row start"); + + size_type count = 0; + size_type stripe_skip_rows = 0; + // Iterate all source files, each source file has corelating metadata + for (size_t src_file_idx = 0; + src_file_idx < per_file_metadata.size() && count < row_start + row_count; + ++src_file_idx) { + std::vector stripe_infos; + + for (size_t stripe_idx = 0; stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size() && + count < row_start + row_count; + ++stripe_idx) { + count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; + if (count > row_start || count == 0) { + stripe_infos.push_back( + std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); + } else { + stripe_skip_rows = count; + } + } + + selected_stripes_mapping.push_back({static_cast(src_file_idx), stripe_infos}); + } + // Need to remove skipped rows from the stripes which are not selected. 
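When the caller did not specify stripes, `select_stripes` above walks the stripes in order, accumulating row counts until the requested window of rows is covered, and remembers how many rows the fully skipped stripes contained so `row_start` can be rebased (the `row_start -= stripe_skip_rows` on the next line). A host-only sketch of that bookkeeping with made-up stripe sizes (single source file, no error handling):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

struct stripe_selection {
  std::vector<int> stripe_ids;  // stripes that overlap the requested window
  int adjusted_row_start;       // row_start rebased to the first selected stripe
};

stripe_selection select_stripes(std::vector<int> const& stripe_rows, int row_start, int row_count)
{
  stripe_selection sel{{}, row_start};
  int count   = 0;
  int skipped = 0;
  for (std::size_t i = 0; i < stripe_rows.size() && count < row_start + row_count; ++i) {
    count += stripe_rows[i];
    if (count > row_start) {
      sel.stripe_ids.push_back(static_cast<int>(i));  // stripe overlaps the window
    } else {
      skipped = count;  // whole stripe lies before the window
    }
  }
  sel.adjusted_row_start = row_start - skipped;
  return sel;
}

int main()
{
  // Three stripes of 1000 rows each; read rows [1500, 2500).
  auto const sel = select_stripes({1000, 1000, 1000}, 1500, 1000);
  assert(sel.stripe_ids == (std::vector<int>{1, 2}));
  assert(sel.adjusted_row_start == 500);  // 1500 minus the 1000 rows of skipped stripe 0
}
```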
+ row_start -= stripe_skip_rows; + } + + // Read each stripe's stripefooter metadata + if (not selected_stripes_mapping.empty()) { + for (auto& mapping : selected_stripes_mapping) { + // Resize to all stripe_info for the source level + per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size()); + + for (size_t i = 0; i < mapping.stripe_info.size(); i++) { + const auto stripe = mapping.stripe_info[i].first; + const auto sf_comp_offset = stripe->offset + stripe->indexLength + stripe->dataLength; + const auto sf_comp_length = stripe->footerLength; + CUDF_EXPECTS( + sf_comp_offset + sf_comp_length < per_file_metadata[mapping.source_idx].source->size(), + "Invalid stripe information"); + const auto buffer = + per_file_metadata[mapping.source_idx].source->host_read(sf_comp_offset, sf_comp_length); + size_t sf_length = 0; + auto sf_data = per_file_metadata[mapping.source_idx].decompressor->Decompress( + buffer->data(), sf_comp_length, &sf_length); + ProtobufReader(sf_data, sf_length) + .read(per_file_metadata[mapping.source_idx].stripefooters[i]); + mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i]; + if (stripe->indexLength == 0) { row_grp_idx_present = false; } + } + } + } + + return selected_stripes_mapping; +} + +column_hierarchy aggregate_orc_metadata::select_columns( + std::vector const& column_paths) +{ + auto const& pfm = per_file_metadata[0]; + + column_hierarchy::nesting_map selected_columns; + if (column_paths.empty()) { + for (auto const& col_id : pfm.ff.types[0].subtypes) { + add_column_to_mapping(selected_columns, pfm, col_id); + } + } else { + for (const auto& path : column_paths) { + bool name_found = false; + for (auto col_id = 1; col_id < pfm.get_num_columns(); ++col_id) { + if (pfm.column_path(col_id) == path) { + name_found = true; + add_column_to_mapping(selected_columns, pfm, col_id); + break; + } + } + CUDF_EXPECTS(name_found, "Unknown column name: " + std::string(path)); + } + } + return {std::move(selected_columns)}; +} + +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp new file mode 100644 index 00000000000..356d20843e8 --- /dev/null +++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "orc.h" + +#include +#include + +namespace cudf::io::orc::detail { + +/** + * @brief Describes a column hierarchy, which may exclude some input columns. 
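`select_columns` above matches each requested dotted path against the per-column paths and then pulls in both the matched column's ancestors (via `update_parent_mapping`) and every column nested under it (via `add_nested_columns`). A simplified sketch of that expansion over a toy schema, using a flat parent array instead of the ORC footer types (names here are hypothetical):

```cpp
#include <iostream>
#include <map>
#include <set>
#include <vector>

// parent[c] is column c's parent (-1 for the root);
// children[c] lists the direct children of column c.
struct schema {
  std::vector<int> parent;
  std::map<int, std::vector<int>> children;
};

// Include the matched column, all of its ancestors, and all nested children.
void add_column(schema const& s, int id, std::set<int>& selected)
{
  for (int cur = id; cur != -1; cur = s.parent[cur]) selected.insert(cur);
  if (auto const it = s.children.find(id); it != s.children.end()) {
    for (int child : it->second) add_column(s, child, selected);
  }
}

int main()
{
  // Root 0 has children {1, 2}; column 2 is a struct with children {3, 4}.
  schema const s{{-1, 0, 0, 2, 2}, {{0, {1, 2}}, {2, {3, 4}}}};
  std::set<int> selected;
  add_column(s, 2, selected);  // selecting column 2 also brings in 0, 3, and 4
  for (int c : selected) std::cout << c << ' ';  // prints "0 2 3 4"
}
```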
+ */ +struct column_hierarchy { + // Maps column IDs to the IDs of their children columns + using nesting_map = std::map>; + // Children IDs of each column + nesting_map children; + // Each element contains column at the given nesting level + std::vector> levels; + + column_hierarchy(nesting_map child_map); + auto num_levels() const { return levels.size(); } +}; + +/** + * @brief In order to support multiple input files/buffers we need to gather + * the metadata across all of those input(s). This class provides a place + * to aggregate that metadata from all the files. + */ +class aggregate_orc_metadata { + using OrcStripeInfo = std::pair; + + /** + * @brief Sums up the number of rows of each source + */ + size_type calc_num_rows() const; + + /** + * @brief Number of columns in a ORC file. + */ + size_type calc_num_cols() const; + + /** + * @brief Sums up the number of stripes of each source + */ + size_type calc_num_stripes() const; + + public: + std::vector per_file_metadata; + size_type const num_rows; + size_type const num_stripes; + bool row_grp_idx_present{true}; + + aggregate_orc_metadata(std::vector> const& sources); + + auto const& get_schema(int schema_idx) const { return per_file_metadata[0].ff.types[schema_idx]; } + + auto get_col_type(int col_idx) const { return per_file_metadata[0].ff.types[col_idx]; } + + auto get_num_rows() const { return num_rows; } + + auto get_num_cols() const { return per_file_metadata[0].get_num_columns(); } + + auto get_num_stripes() const { return num_stripes; } + + auto const& get_types() const { return per_file_metadata[0].ff.types; } + + int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; } + + auto is_row_grp_idx_present() const { return row_grp_idx_present; } + + /** + * @brief Returns the name of the given column from the given source. + */ + auto column_name(const int source_idx, const int column_id) const + { + CUDF_EXPECTS(source_idx <= static_cast(per_file_metadata.size()), + "Out of range source_idx provided"); + return per_file_metadata[source_idx].column_name(column_id); + } + + /** + * @brief Returns the full name of the given column from the given source. + * + * Full name includes ancestor columns' names. + */ + auto column_path(const int source_idx, const int column_id) const + { + CUDF_EXPECTS(source_idx <= static_cast(per_file_metadata.size()), + "Out of range source_idx provided"); + return per_file_metadata[source_idx].column_path(column_id); + } + + /** + * @brief Selects the stripes to read, based on the row/stripe selection parameters. + * + * Stripes are potentially selected from multiple files. + */ + std::vector select_stripes( + std::vector> const& user_specified_stripes, + size_type& row_start, + size_type& row_count); + + /** + * @brief Filters ORC file to a selection of columns, based on their paths in the file. + * + * Paths are in format "grandparent_col.parent_col.child_col", where the root ORC column is + * ommited to match the cuDF table hierarchy. + * + * @param column_paths List of full column names (i.e. 
paths) to select from the ORC file + * @return Columns hierarchy - lists of children columns and sorted columns in each nesting level + */ + column_hierarchy select_columns(std::vector const& column_paths); +}; + +} // namespace cudf::io::orc::detail diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index b275496c705..89eac0c9901 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -15,10 +15,13 @@ */ #include "orc.h" -#include #include "orc_field_reader.hpp" #include "orc_field_writer.hpp" +#include + +#include + namespace cudf { namespace io { namespace orc { @@ -459,48 +462,46 @@ metadata::metadata(datasource* const src) : source(src) auto md_data = decompressor->Decompress(buffer->data(), ps.metadataLength, &md_length); orc::ProtobufReader(md_data, md_length).read(md); - // Initialize the column names + init_parent_descriptors(); init_column_names(); } -void metadata::init_column_names() const -{ - auto const schema_idxs = get_schema_indexes(); - auto const& types = ff.types; - for (int32_t col_id = 0; col_id < get_num_columns(); ++col_id) { - std::string col_name; - if (schema_idxs[col_id].parent >= 0 and schema_idxs[col_id].field >= 0) { - auto const parent_idx = static_cast(schema_idxs[col_id].parent); - auto const field_idx = static_cast(schema_idxs[col_id].field); - if (field_idx < types[parent_idx].fieldNames.size()) { - col_name = types[parent_idx].fieldNames[field_idx]; - } - } - // If we have no name (root column), generate a name - column_names.push_back(col_name.empty() ? "col" + std::to_string(col_id) : col_name); - } -} - -std::vector metadata::get_schema_indexes() const -{ - std::vector result(ff.types.size()); - - auto const schema_size = static_cast(result.size()); - for (uint32_t i = 0; i < schema_size; i++) { - auto const& subtypes = ff.types[i].subtypes; - auto const num_children = static_cast(subtypes.size()); - if (result[i].parent == -1) { // Not initialized - result[i].parent = i; // set root node as its own parent - } - for (uint32_t j = 0; j < num_children; j++) { - auto const column_id = subtypes[j]; - CUDF_EXPECTS(column_id > i && column_id < schema_size, "Invalid column id"); - CUDF_EXPECTS(result[column_id].parent == -1, "Same node referenced twice"); - result[column_id].parent = i; - result[column_id].field = j; +void metadata::init_column_names() +{ + column_names.resize(get_num_columns()); + thrust::tabulate(column_names.begin(), column_names.end(), [&](auto col_id) { + if (not column_has_parent(col_id)) return std::string{}; + auto const& parent_field_names = ff.types[parent_id(col_id)].fieldNames; + // Child columns of lists don't have a name in ORC files, generate placeholder in that case + return field_index(col_id) < static_cast(parent_field_names.size()) + ? parent_field_names[field_index(col_id)] + : std::to_string(col_id); + }); + + column_paths.resize(get_num_columns()); + thrust::tabulate(column_paths.begin(), column_paths.end(), [&](auto col_id) { + if (not column_has_parent(col_id)) return std::string{}; + // Don't include ORC root column name in path + return (parent_id(col_id) == 0 ? 
"" : column_paths[parent_id(col_id)] + ".") + + column_names[col_id]; + }); +} + +void metadata::init_parent_descriptors() +{ + auto const num_columns = static_cast(ff.types.size()); + parents.resize(num_columns); + + for (size_type col_id = 0; col_id < num_columns; ++col_id) { + auto const& subtypes = ff.types[col_id].subtypes; + auto const num_children = static_cast(subtypes.size()); + for (size_type field_idx = 0; field_idx < num_children; ++field_idx) { + auto const child_id = static_cast(subtypes[field_idx]); + CUDF_EXPECTS(child_id > col_id && child_id < num_columns, "Invalid column id"); + CUDF_EXPECTS(not column_has_parent(child_id), "Same node referenced twice"); + parents[child_id] = {col_id, field_idx}; } } - return result; } } // namespace orc diff --git a/cpp/src/io/orc/orc.h b/cpp/src/io/orc/orc.h index 405bf7c2ecc..d75b76a0341 100644 --- a/cpp/src/io/orc/orc.h +++ b/cpp/src/io/orc/orc.h @@ -555,8 +555,8 @@ class OrcDecompressor { * */ struct orc_column_meta { - uint32_t id; // orc id for the column - uint32_t num_children; // number of children at the same level of nesting in case of struct + size_type id; // orc id for the column + size_type num_children; // number of children at the same level of nesting in case of struct }; /** @@ -586,13 +586,51 @@ class metadata { size_t get_total_rows() const { return ff.numberOfRows; } int get_num_stripes() const { return ff.stripes.size(); } int get_num_columns() const { return ff.types.size(); } - std::string const& get_column_name(int32_t column_id) const + /** + * @brief Returns the name of the column with the given ID. + * + * Name might not be unique in the ORC file, since columns with different parents are allowed to + * have the same names. + */ + std::string const& column_name(size_type column_id) const { - if (column_names.empty() && get_num_columns() != 0) { init_column_names(); } + CUDF_EXPECTS(column_id < get_num_columns(), "Out of range column id provided"); return column_names[column_id]; } + /** + * @brief Returns the full name of the column with the given ID - includes the ancestor columns + * names. + * + * Each column in the ORC file has a unique path. + */ + std::string const& column_path(size_type column_id) const + { + CUDF_EXPECTS(column_id < get_num_columns(), "Out of range column id provided"); + return column_paths[column_id]; + } int get_row_index_stride() const { return ff.rowIndexStride; } + /** + * @brief Returns the ID of the parent column of the given column. + */ + size_type parent_id(size_type column_id) const { return parents.at(column_id).value().id; } + + /** + * @brief Returns the index the given column has in its parent's children list. + */ + size_type field_index(size_type column_id) const + { + return parents.at(column_id).value().field_idx; + } + + /** + * @brief Returns whether the given column has a parent. 
+ */ + size_type column_has_parent(size_type column_id) const + { + return parents.at(column_id).has_value(); + } + public: PostScript ps; FileFooter ff; @@ -602,14 +640,19 @@ class metadata { datasource* const source; private: - struct schema_indexes { - int32_t parent = -1; - int32_t field = -1; + struct column_parent { + // parent's ID + size_type id; + // Index of this column in the parent's list of children + size_type field_idx; + column_parent(size_type parent_id, size_type field_idx) : id{parent_id}, field_idx{field_idx} {} }; - std::vector get_schema_indexes() const; - void init_column_names() const; + void init_parent_descriptors(); + std::vector> parents; - mutable std::vector column_names; + void init_column_names(); + std::vector column_names; + std::vector column_paths; }; /** diff --git a/cpp/src/io/orc/orc_common.h b/cpp/src/io/orc/orc_common.h index eedaa9d4fc2..f88a84b0bfc 100644 --- a/cpp/src/io/orc/orc_common.h +++ b/cpp/src/io/orc/orc_common.h @@ -22,9 +22,7 @@ namespace cudf { namespace io { namespace orc { -// ORC rows are divided into groups and assigned indexes for faster seeking -static constexpr uint32_t default_row_index_stride = 10000; -static constexpr uint32_t BLOCK_HEADER_SIZE = 3; +static constexpr uint32_t BLOCK_HEADER_SIZE = 3; enum CompressionKind : uint8_t { NONE = 0, diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index 389895abc83..f6a7c3f5f03 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -110,9 +110,9 @@ struct ColumnDesc { uint32_t rowgroup_id; // row group position ColumnEncodingKind encoding_kind; // column encoding kind TypeKind type_kind; // column data type - uint8_t dtype_len; // data type length (for types that can be mapped to different sizes) - int32_t decimal_scale; // number of fractional decimal digits for decimal type - int32_t ts_clock_rate; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) + uint8_t dtype_len; // data type length (for types that can be mapped to different sizes) + int32_t decimal_scale; // number of fractional decimal digits for decimal type + type_id timestamp_type_id; // output timestamp type id (type_id::EMPTY by default) column_validity_info parent_validity_info; // consists of parent column valid_map and null count uint32_t* parent_null_count_prefix_sums; // per-stripe prefix sums of parent column's null count }; diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index d05bec92166..31d4184993f 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -19,13 +19,14 @@ * @brief cuDF-IO ORC reader class implementation */ -#include "io/orc/orc_gpu.h" +#include "orc.h" +#include "orc_gpu.h" #include "reader_impl.hpp" #include "timezone.cuh" #include +#include #include -#include "orc.h" #include #include @@ -33,7 +34,6 @@ #include #include -#include #include #include #include @@ -42,15 +42,13 @@ #include #include -#include +#include namespace cudf { namespace io { namespace detail { namespace orc { -// Import functionality that's independent of legacy code using namespace cudf::io::orc; -using namespace cudf::io; namespace { /** @@ -116,9 +114,6 @@ constexpr std::pair get_index_type_and_pos( } } -} // namespace - -namespace { /** * @brief struct to store buffer data and size of list buffer */ @@ -241,303 +236,11 @@ bool should_convert_decimal_column_to_float(const std::vector& colu { return (std::find(columns_to_convert.begin(), columns_to_convert.end(), - metadata.get_column_name(column_index)) != 
columns_to_convert.end()); + metadata.column_name(column_index)) != columns_to_convert.end()); } } // namespace -/** - * @brief In order to support multiple input files/buffers we need to gather - * the metadata across all of those input(s). This class provides a place - * to aggregate that metadata from all the files. - */ -class aggregate_orc_metadata { - using OrcStripeInfo = std::pair; - - public: - mutable std::vector per_file_metadata; - size_type const num_rows; - size_type const num_columns; - size_type const num_stripes; - bool row_grp_idx_present = true; - - /** - * @brief Create a metadata object from each element in the source vector - */ - auto metadatas_from_sources(std::vector> const& sources) - { - std::vector metadatas; - std::transform( - sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) { - return cudf::io::orc::metadata(source.get()); - }); - return metadatas; - } - - /** - * @brief Sums up the number of rows of each source - */ - size_type calc_num_rows() const - { - return std::accumulate( - per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { - return sum + pfm.get_total_rows(); - }); - } - - /** - * @brief Number of columns in a ORC file. - */ - size_type calc_num_cols() const - { - if (not per_file_metadata.empty()) { return per_file_metadata[0].get_num_columns(); } - return 0; - } - - /** - * @brief Sums up the number of stripes of each source - */ - size_type calc_num_stripes() const - { - return std::accumulate( - per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { - return sum + pfm.get_num_stripes(); - }); - } - - aggregate_orc_metadata(std::vector> const& sources) - : per_file_metadata(metadatas_from_sources(sources)), - num_rows(calc_num_rows()), - num_columns(calc_num_cols()), - num_stripes(calc_num_stripes()) - { - // Verify that the input files have the same number of columns, - // as well as matching types, compression, and names - for (auto const& pfm : per_file_metadata) { - CUDF_EXPECTS(per_file_metadata[0].get_num_columns() == pfm.get_num_columns(), - "All sources must have the same number of columns"); - CUDF_EXPECTS(per_file_metadata[0].ps.compression == pfm.ps.compression, - "All sources must have the same compression type"); - - // Check the types, column names, and decimal scale - for (size_t i = 0; i < pfm.ff.types.size(); i++) { - CUDF_EXPECTS(pfm.ff.types[i].kind == per_file_metadata[0].ff.types[i].kind, - "Column types across all input sources must be the same"); - CUDF_EXPECTS(std::equal(pfm.ff.types[i].fieldNames.begin(), - pfm.ff.types[i].fieldNames.end(), - per_file_metadata[0].ff.types[i].fieldNames.begin()), - "All source column names must be the same"); - CUDF_EXPECTS( - pfm.ff.types[i].scale.value_or(0) == per_file_metadata[0].ff.types[i].scale.value_or(0), - "All scale values must be the same"); - } - } - } - - auto const& get_schema(int schema_idx) const { return per_file_metadata[0].ff.types[schema_idx]; } - - auto get_col_type(int col_idx) const { return per_file_metadata[0].ff.types[col_idx]; } - - auto get_num_rows() const { return num_rows; } - - auto get_num_cols() const { return per_file_metadata[0].get_num_columns(); } - - auto get_num_stripes() const { return num_stripes; } - - auto get_num_source_files() const { return per_file_metadata.size(); } - - auto const& get_types() const { return per_file_metadata[0].ff.types; } - - int get_row_index_stride() const { return per_file_metadata[0].ff.rowIndexStride; } - - auto 
get_column_name(const int source_idx, const int column_idx) const - { - CUDF_EXPECTS(source_idx <= static_cast(per_file_metadata.size()), - "Out of range source_idx provided"); - CUDF_EXPECTS(column_idx <= per_file_metadata[source_idx].get_num_columns(), - "Out of range column_idx provided"); - return per_file_metadata[source_idx].get_column_name(column_idx); - } - - auto is_row_grp_idx_present() const { return row_grp_idx_present; } - - std::vector select_stripes( - std::vector> const& user_specified_stripes, - size_type& row_start, - size_type& row_count) - { - std::vector selected_stripes_mapping; - - if (!user_specified_stripes.empty()) { - CUDF_EXPECTS(user_specified_stripes.size() == get_num_source_files(), - "Must specify stripes for each source"); - // row_start is 0 if stripes are set. If this is not true anymore, then - // row_start needs to be subtracted to get the correct row_count - CUDF_EXPECTS(row_start == 0, "Start row index should be 0"); - - row_count = 0; - // Each vector entry represents a source file; each nested vector represents the - // user_defined_stripes to get from that source file - for (size_t src_file_idx = 0; src_file_idx < user_specified_stripes.size(); ++src_file_idx) { - std::vector stripe_infos; - - // Coalesce stripe info at the source file later since that makes downstream processing much - // easier in impl::read - for (const size_t& stripe_idx : user_specified_stripes[src_file_idx]) { - CUDF_EXPECTS(stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size(), - "Invalid stripe index"); - stripe_infos.push_back( - std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); - row_count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; - } - selected_stripes_mapping.push_back({static_cast(src_file_idx), stripe_infos}); - } - } else { - row_start = std::max(row_start, 0); - if (row_count < 0) { - row_count = static_cast( - std::min(get_num_rows(), std::numeric_limits::max())); - } - row_count = std::min(row_count, get_num_rows() - row_start); - CUDF_EXPECTS(row_count >= 0, "Invalid row count"); - CUDF_EXPECTS(row_start <= get_num_rows(), "Invalid row start"); - - size_type count = 0; - size_type stripe_skip_rows = 0; - // Iterate all source files, each source file has corelating metadata - for (size_t src_file_idx = 0; - src_file_idx < per_file_metadata.size() && count < row_start + row_count; - ++src_file_idx) { - std::vector stripe_infos; - - for (size_t stripe_idx = 0; - stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size() && - count < row_start + row_count; - ++stripe_idx) { - count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; - if (count > row_start || count == 0) { - stripe_infos.push_back( - std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); - } else { - stripe_skip_rows = count; - } - } - - selected_stripes_mapping.push_back({static_cast(src_file_idx), stripe_infos}); - } - // Need to remove skipped rows from the stripes which are not selected. 
- row_start -= stripe_skip_rows; - } - - // Read each stripe's stripefooter metadata - if (not selected_stripes_mapping.empty()) { - for (auto& mapping : selected_stripes_mapping) { - // Resize to all stripe_info for the source level - per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size()); - - for (size_t i = 0; i < mapping.stripe_info.size(); i++) { - const auto stripe = mapping.stripe_info[i].first; - const auto sf_comp_offset = stripe->offset + stripe->indexLength + stripe->dataLength; - const auto sf_comp_length = stripe->footerLength; - CUDF_EXPECTS( - sf_comp_offset + sf_comp_length < per_file_metadata[mapping.source_idx].source->size(), - "Invalid stripe information"); - const auto buffer = - per_file_metadata[mapping.source_idx].source->host_read(sf_comp_offset, sf_comp_length); - size_t sf_length = 0; - auto sf_data = per_file_metadata[mapping.source_idx].decompressor->Decompress( - buffer->data(), sf_comp_length, &sf_length); - ProtobufReader(sf_data, sf_length) - .read(per_file_metadata[mapping.source_idx].stripefooters[i]); - mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i]; - if (stripe->indexLength == 0) { row_grp_idx_present = false; } - } - } - } - - return selected_stripes_mapping; - } - - /** - * @brief Adds column as per the request and saves metadata about children. - * Children of a column will be added to the next level. - * - * @param selection A vector that saves list of columns as per levels of nesting. - * @param types A vector of schema types of columns. - * @param level current level of nesting. - * @param id current column id that needs to be added. - * @param has_timestamp_column True if timestamp column present and false otherwise. - * @param has_nested_column True if any of the selected column is a nested type. - */ - void add_column(std::vector>& selection, - std::vector const& types, - const size_t level, - const uint32_t id, - bool& has_timestamp_column, - bool& has_nested_column) - { - if (level == selection.size()) { selection.emplace_back(); } - selection[level].push_back({id, 0}); - const int col_id = selection[level].size() - 1; - if (types[id].kind == orc::TIMESTAMP) { has_timestamp_column = true; } - - if (types[id].kind == orc::MAP or types[id].kind == orc::LIST or - types[id].kind == orc::STRUCT) { - has_nested_column = true; - for (const auto child_id : types[id].subtypes) { - // Since nested column needs to be processed before its child can be processed, - // child column is being added to next level - add_column(selection, types, level + 1, child_id, has_timestamp_column, has_nested_column); - } - selection[level][col_id].num_children = types[id].subtypes.size(); - } - } - - /** - * @brief Filters and reduces down to a selection of columns - * - * @param use_names List of column names to select - * @param has_timestamp_column True if timestamp column present and false otherwise - * @param has_nested_column True if any of the selected column is a nested type. 
- * - * @return Vector of list of ORC column meta-data - */ - std::vector> select_columns( - std::vector const& use_names, bool& has_timestamp_column, bool& has_nested_column) - { - auto const& pfm = per_file_metadata[0]; - std::vector> selection; - - if (not use_names.empty()) { - uint32_t index = 0; - // Have to check only parent columns - auto const num_columns = pfm.ff.types[0].subtypes.size(); - - for (const auto& use_name : use_names) { - bool name_found = false; - for (uint32_t i = 0; i < num_columns; ++i, ++index) { - if (index >= num_columns) { index = 0; } - auto col_id = pfm.ff.types[0].subtypes[index]; - if (pfm.get_column_name(col_id) == use_name) { - name_found = true; - add_column(selection, pfm.ff.types, 0, col_id, has_timestamp_column, has_nested_column); - // Should start with next index - index = i + 1; - break; - } - } - CUDF_EXPECTS(name_found, "Unknown column name : " + std::string(use_name)); - } - } else { - for (auto const& col_id : pfm.ff.types[0].subtypes) { - add_column(selection, pfm.ff.types, 0, col_id, has_timestamp_column, has_nested_column); - } - } - - return selection; - } -}; - void snappy_decompress(device_span comp_in, device_span comp_stat, size_t max_uncomp_page_size, @@ -670,15 +373,13 @@ rmm::device_buffer reader::impl::decompress_stripe_data( // Dispatch batches of blocks to decompress if (num_compressed_blocks > 0) { - auto env_use_nvcomp = std::getenv("LIBCUDF_USE_NVCOMP"); - bool use_nvcomp = env_use_nvcomp != nullptr ? std::atoi(env_use_nvcomp) : 0; switch (decompressor->GetKind()) { case orc::ZLIB: CUDA_TRY( gpuinflate(inflate_in.data(), inflate_out.data(), num_compressed_blocks, 0, stream)); break; case orc::SNAPPY: - if (use_nvcomp) { + if (nvcomp_integration::is_stable_enabled()) { device_span inflate_in_view{inflate_in.data(), num_compressed_blocks}; device_span inflate_out_view{inflate_out.data(), @@ -933,18 +634,18 @@ void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan row_groups, std::vector& out_buffers, std::vector const& list_col, - const int32_t level) + const size_type level) { const auto num_of_stripes = chunks.size().first; const auto num_of_rowgroups = row_groups.size().first; - const auto num_parent_cols = _selected_columns[level].size(); - const auto num_child_cols = _selected_columns[level + 1].size(); + const auto num_parent_cols = selected_columns.levels[level].size(); + const auto num_child_cols = selected_columns.levels[level + 1].size(); const auto number_of_child_chunks = num_child_cols * num_of_stripes; auto& num_child_rows = _col_meta.num_child_rows; auto& parent_column_data = _col_meta.parent_column_data; // Reset the meta to store child column details. 
- num_child_rows.resize(_selected_columns[level + 1].size()); + num_child_rows.resize(selected_columns.levels[level + 1].size()); std::fill(num_child_rows.begin(), num_child_rows.end(), 0); parent_column_data.resize(number_of_child_chunks); _col_meta.parent_column_index.resize(number_of_child_chunks); @@ -976,7 +677,7 @@ void reader::impl::aggregate_child_meta(cudf::detail::host_2dspan reader::impl::create_empty_column(const int32_t orc_col_id, +std::unique_ptr reader::impl::create_empty_column(const size_type orc_col_id, column_name_info& schema_info, rmm::cuda_stream_view stream) { - schema_info.name = _metadata->get_column_name(0, orc_col_id); + schema_info.name = _metadata.column_name(0, orc_col_id); // If the column type is orc::DECIMAL see if the user // desires it to be converted to float64 or not auto const decimal_as_float64 = should_convert_decimal_column_to_float( - _decimal_cols_as_float, _metadata->per_file_metadata[0], orc_col_id); + _decimal_cols_as_float, _metadata.per_file_metadata[0], orc_col_id); auto const type = to_type_id( - _metadata->get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64); + _metadata.get_schema(orc_col_id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64); int32_t scale = 0; std::vector> child_columns; std::unique_ptr out_col = nullptr; - auto kind = _metadata->get_col_type(orc_col_id).kind; + auto kind = _metadata.get_col_type(orc_col_id).kind; switch (kind) { case orc::LIST: @@ -1044,9 +745,9 @@ std::unique_ptr reader::impl::create_empty_column(const int32_t orc_col_ schema_info.children.emplace_back(""); out_col = make_lists_column( 0, - make_empty_column(data_type(type_id::INT32)), + make_empty_column(type_id::INT32), create_empty_column( - _metadata->get_col_type(orc_col_id).subtypes[0], schema_info.children.back(), stream), + _metadata.get_col_type(orc_col_id).subtypes[0], schema_info.children.back(), stream), 0, rmm::device_buffer{0, stream}, stream); @@ -1054,8 +755,8 @@ std::unique_ptr reader::impl::create_empty_column(const int32_t orc_col_ case orc::MAP: { schema_info.children.emplace_back("offsets"); schema_info.children.emplace_back("struct"); - const auto child_column_ids = _metadata->get_col_type(orc_col_id).subtypes; - for (size_t idx = 0; idx < _metadata->get_col_type(orc_col_id).subtypes.size(); idx++) { + const auto child_column_ids = _metadata.get_col_type(orc_col_id).subtypes; + for (size_t idx = 0; idx < _metadata.get_col_type(orc_col_id).subtypes.size(); idx++) { auto& children_schema = schema_info.children.back().children; children_schema.emplace_back(""); child_columns.push_back(create_empty_column( @@ -1066,7 +767,7 @@ std::unique_ptr reader::impl::create_empty_column(const int32_t orc_col_ auto struct_col = make_structs_column(0, std::move(child_columns), 0, rmm::device_buffer{0, stream}, stream); out_col = make_lists_column(0, - make_empty_column(data_type(type_id::INT32)), + make_empty_column(type_id::INT32), std::move(struct_col), 0, rmm::device_buffer{0, stream}, @@ -1074,7 +775,7 @@ std::unique_ptr reader::impl::create_empty_column(const int32_t orc_col_ } break; case orc::STRUCT: - for (const auto col : _metadata->get_col_type(orc_col_id).subtypes) { + for (const auto col : _metadata.get_col_type(orc_col_id).subtypes) { schema_info.children.emplace_back(""); child_columns.push_back(create_empty_column(col, schema_info.children.back(), stream)); } @@ -1084,19 +785,19 @@ std::unique_ptr reader::impl::create_empty_column(const int32_t orc_col_ case orc::DECIMAL: if (type == 
type_id::DECIMAL64) { - scale = -static_cast(_metadata->get_types()[orc_col_id].scale.value_or(0)); + scale = -static_cast(_metadata.get_types()[orc_col_id].scale.value_or(0)); } out_col = make_empty_column(data_type(type, scale)); break; - default: out_col = make_empty_column(data_type(type)); + default: out_col = make_empty_column(type); } return out_col; } // Adds child column buffers to parent column -column_buffer&& reader::impl::assemble_buffer(const int32_t orc_col_id, +column_buffer&& reader::impl::assemble_buffer(const size_type orc_col_id, std::vector>& col_buffers, const size_t level, rmm::cuda_stream_view stream) @@ -1104,12 +805,12 @@ column_buffer&& reader::impl::assemble_buffer(const int32_t orc_col_id, auto const col_id = _col_meta.orc_col_map[level][orc_col_id]; auto& col_buffer = col_buffers[level][col_id]; - col_buffer.name = _metadata->get_column_name(0, orc_col_id); - auto kind = _metadata->get_col_type(orc_col_id).kind; + col_buffer.name = _metadata.column_name(0, orc_col_id); + auto kind = _metadata.get_col_type(orc_col_id).kind; switch (kind) { case orc::LIST: case orc::STRUCT: - for (auto const& col : _metadata->get_col_type(orc_col_id).subtypes) { + for (auto const& col : selected_columns.children[orc_col_id]) { col_buffer.children.emplace_back(assemble_buffer(col, col_buffers, level + 1, stream)); } @@ -1117,9 +818,9 @@ column_buffer&& reader::impl::assemble_buffer(const int32_t orc_col_id, case orc::MAP: { std::vector child_col_buffers; // Get child buffers - for (size_t idx = 0; idx < _metadata->get_col_type(orc_col_id).subtypes.size(); idx++) { + for (size_t idx = 0; idx < selected_columns.children[orc_col_id].size(); idx++) { auto name = get_map_child_col_name(idx); - auto col = _metadata->get_col_type(orc_col_id).subtypes[idx]; + auto col = selected_columns.children[orc_col_id][idx]; child_col_buffers.emplace_back(assemble_buffer(col, col_buffers, level + 1, stream)); child_col_buffers.back().name = name; } @@ -1145,8 +846,8 @@ void reader::impl::create_columns(std::vector>&& col_ std::vector& schema_info, rmm::cuda_stream_view stream) { - std::transform(_selected_columns[0].begin(), - _selected_columns[0].end(), + std::transform(selected_columns.levels[0].begin(), + selected_columns.levels[0].end(), std::back_inserter(out_columns), [&](auto const col_meta) { schema_info.emplace_back(""); @@ -1158,15 +859,11 @@ void reader::impl::create_columns(std::vector>&& col_ reader::impl::impl(std::vector>&& sources, orc_reader_options const& options, rmm::mr::device_memory_resource* mr) - : _mr(mr), _sources(std::move(sources)) + : _mr(mr), + _sources(std::move(sources)), + _metadata{_sources}, + selected_columns{_metadata.select_columns(options.get_columns())} { - // Open and parse the source(s) dataset metadata - _metadata = std::make_unique(_sources); - - // Select only columns required by the options - _selected_columns = - _metadata->select_columns(options.get_columns(), _has_timestamp_column, _has_nested_column); - // Override output timestamp resolution if requested if (options.get_timestamp_type().id() != type_id::EMPTY) { _timestamp_type = options.get_timestamp_type(); @@ -1182,56 +879,80 @@ reader::impl::impl(std::vector>&& sources, _decimal_cols_as_float = options.get_decimal_cols_as_float(); } +timezone_table reader::impl::compute_timezone_table( + const std::vector& selected_stripes, + rmm::cuda_stream_view stream) +{ + if (selected_stripes.empty()) return {}; + + auto const has_timestamp_column = std::any_of( + selected_columns.levels.cbegin(), 
selected_columns.levels.cend(), [&](auto& col_lvl) { + return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto& col_meta) { + return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP; + }); + }); + if (not has_timestamp_column) return {}; + + return build_timezone_transition_table(selected_stripes[0].stripe_info[0].second->writerTimezone, + stream); +} + table_with_metadata reader::impl::read(size_type skip_rows, size_type num_rows, const std::vector>& stripes, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(skip_rows == 0 or (not _has_nested_column), + // Selected columns at different levels of nesting are stored in different elements + // of `selected_columns`; thus, size == 1 means no nested columns + CUDF_EXPECTS(skip_rows == 0 or selected_columns.num_levels() == 1, "skip_rows is not supported by nested columns"); std::vector> out_columns; // buffer and stripe data are stored as per nesting level - std::vector> out_buffers(_selected_columns.size()); + std::vector> out_buffers(selected_columns.num_levels()); std::vector schema_info; - std::vector> lvl_stripe_data(_selected_columns.size()); + std::vector> lvl_stripe_data(selected_columns.num_levels()); std::vector>> null_count_prefix_sums; table_metadata out_metadata; // There are no columns in the table - if (_selected_columns.size() == 0) return {std::make_unique
<table>(), std::move(out_metadata)}; + if (selected_columns.num_levels() == 0) + return {std::make_unique<table>
(), std::move(out_metadata)}; // Select only stripes required (aka row groups) - const auto selected_stripes = _metadata->select_stripes(stripes, skip_rows, num_rows); + const auto selected_stripes = _metadata.select_stripes(stripes, skip_rows, num_rows); + + auto const tz_table = compute_timezone_table(selected_stripes, stream); // Iterates through levels of nested columns, child column will be one level down // compared to parent column. - for (size_t level = 0; level < _selected_columns.size(); level++) { - auto& selected_columns = _selected_columns[level]; + for (size_t level = 0; level < selected_columns.num_levels(); level++) { + auto& columns_level = selected_columns.levels[level]; // Association between each ORC column and its cudf::column - _col_meta.orc_col_map.emplace_back(_metadata->get_num_cols(), -1); + _col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); std::vector nested_col; bool is_data_empty = false; // Get a list of column data types std::vector column_types; - for (auto& col : selected_columns) { + for (auto& col : columns_level) { // If the column type is orc::DECIMAL see if the user // desires it to be converted to float64 or not auto const decimal_as_float64 = should_convert_decimal_column_to_float( - _decimal_cols_as_float, _metadata->per_file_metadata[0], col.id); + _decimal_cols_as_float, _metadata.per_file_metadata[0], col.id); auto col_type = to_type_id( - _metadata->get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64); + _metadata.get_col_type(col.id), _use_np_dtypes, _timestamp_type.id(), decimal_as_float64); CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type"); // Remove this once we support Decimal128 data type CUDF_EXPECTS( - (col_type != type_id::DECIMAL64) or (_metadata->get_col_type(col.id).precision <= 18), + (col_type != type_id::DECIMAL64) or (_metadata.get_col_type(col.id).precision <= 18), "Decimal data has precision > 18, Decimal64 data type doesn't support it."); if (col_type == type_id::DECIMAL64) { // sign of the scale is changed since cuDF follows c++ libraries like CNL // which uses negative scaling, but liborc and other libraries // follow positive scaling. 
- auto const scale = -static_cast(_metadata->get_col_type(col.id).scale.value_or(0)); + auto const scale = + -static_cast(_metadata.get_col_type(col.id).scale.value_or(0)); column_types.emplace_back(col_type, scale); } else { column_types.emplace_back(col_type); @@ -1245,8 +966,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, // If no rows or stripes to read, return empty columns if (num_rows <= 0 || selected_stripes.empty()) { - std::transform(_selected_columns[0].begin(), - _selected_columns[0].end(), + std::transform(selected_columns.levels[0].begin(), + selected_columns.levels[0].end(), std::back_inserter(out_columns), [&](auto const col_meta) { schema_info.emplace_back(""); @@ -1262,7 +983,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, [](size_t sum, auto& stripe_source_mapping) { return sum + stripe_source_mapping.stripe_info.size(); }); - const auto num_columns = selected_columns.size(); + const auto num_columns = columns_level.size(); cudf::detail::hostdevice_2dvector chunks( total_num_stripes, num_columns, stream); memset(chunks.base_host_ptr(), 0, chunks.memory_size()); @@ -1270,11 +991,11 @@ table_with_metadata reader::impl::read(size_type skip_rows, const bool use_index = (_use_index == true) && // Do stripes have row group index - _metadata->is_row_grp_idx_present() && + _metadata.is_row_grp_idx_present() && // Only use if we don't have much work with complete columns & stripes // TODO: Consider nrows, gpu, and tune the threshold - (num_rows > _metadata->get_row_index_stride() && !(_metadata->get_row_index_stride() & 7) && - _metadata->get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) && + (num_rows > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) && + _metadata.get_row_index_stride() > 0 && num_columns * total_num_stripes < 8 * 128) && // Only use if first row is aligned to a stripe boundary // TODO: Fix logic to handle unaligned rows (skip_rows == 0); @@ -1283,12 +1004,13 @@ table_with_metadata reader::impl::read(size_type skip_rows, std::vector stream_info; null_count_prefix_sums.emplace_back(); - null_count_prefix_sums.back().reserve(_selected_columns[level].size()); - std::generate_n( - std::back_inserter(null_count_prefix_sums.back()), _selected_columns[level].size(), [&]() { - return cudf::detail::make_zeroed_device_uvector_async(total_num_stripes, - stream); - }); + null_count_prefix_sums.back().reserve(selected_columns.levels[level].size()); + std::generate_n(std::back_inserter(null_count_prefix_sums.back()), + selected_columns.levels[level].size(), + [&]() { + return cudf::detail::make_zeroed_device_uvector_async( + total_num_stripes, stream); + }); // Tracker for eventually deallocating compressed and uncompressed data auto& stripe_data = lvl_stripe_data[level]; @@ -1310,7 +1032,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, stripe_info, stripe_footer, _col_meta.orc_col_map[level], - _metadata->get_types(), + _metadata.get_types(), use_index, &num_dict_entries, chunks, @@ -1342,16 +1064,16 @@ table_with_metadata reader::impl::read(size_type skip_rows, len += stream_info[stream_count].length; stream_count++; } - if (_metadata->per_file_metadata[stripe_source_mapping.source_idx] + if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] .source->is_device_read_preferred(len)) { read_tasks.push_back( - std::make_pair(_metadata->per_file_metadata[stripe_source_mapping.source_idx] + std::make_pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] 
.source->device_read_async(offset, len, d_dst, stream), len)); } else { const auto buffer = - _metadata->per_file_metadata[stripe_source_mapping.source_idx].source->host_read( + _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read( offset, len); CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); CUDA_TRY(cudaMemcpyAsync( @@ -1364,8 +1086,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, const auto rowgroup_id = num_rowgroups; auto stripe_num_rowgroups = 0; if (use_index) { - stripe_num_rowgroups = (num_rows_per_stripe + _metadata->get_row_index_stride() - 1) / - _metadata->get_row_index_stride(); + stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) / + _metadata.get_row_index_stride(); } // Update chunks to reference streams pointers for (size_t col_idx = 0; col_idx < num_columns; col_idx++) { @@ -1386,19 +1108,17 @@ table_with_metadata reader::impl::read(size_type skip_rows, (level == 0) ? nullptr : null_count_prefix_sums[level - 1][_col_meta.parent_column_index[col_idx]].data(); - chunk.encoding_kind = stripe_footer->columns[selected_columns[col_idx].id].kind; - chunk.type_kind = _metadata->per_file_metadata[stripe_source_mapping.source_idx] - .ff.types[selected_columns[col_idx].id] + chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind; + chunk.type_kind = _metadata.per_file_metadata[stripe_source_mapping.source_idx] + .ff.types[columns_level[col_idx].id] .kind; // num_child_rows for a struct column will be same, for other nested types it will be // calculated. - chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; - auto const decimal_as_float64 = - should_convert_decimal_column_to_float(_decimal_cols_as_float, - _metadata->per_file_metadata[0], - selected_columns[col_idx].id); - chunk.decimal_scale = _metadata->per_file_metadata[stripe_source_mapping.source_idx] - .ff.types[selected_columns[col_idx].id] + chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows; + auto const decimal_as_float64 = should_convert_decimal_column_to_float( + _decimal_cols_as_float, _metadata.per_file_metadata[0], columns_level[col_idx].id); + chunk.decimal_scale = _metadata.per_file_metadata[stripe_source_mapping.source_idx] + .ff.types[columns_level[col_idx].id] .scale.value_or(0) | (decimal_as_float64 ? orc::gpu::orc_decimal2float64_scale : 0); @@ -1407,11 +1127,11 @@ table_with_metadata reader::impl::read(size_type skip_rows, ? sizeof(string_index_pair) : ((column_types[col_idx].id() == type_id::LIST) or (column_types[col_idx].id() == type_id::STRUCT)) - ? sizeof(int32_t) + ? 
sizeof(size_type) : cudf::size_of(column_types[col_idx]); chunk.num_rowgroups = stripe_num_rowgroups; if (chunk.type_kind == orc::TIMESTAMP) { - chunk.ts_clock_rate = to_clockrate(_timestamp_type.id()); + chunk.timestamp_type_id = _timestamp_type.id(); } if (not is_data_empty) { for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { @@ -1450,15 +1170,15 @@ table_with_metadata reader::impl::read(size_type skip_rows, }); } // Setup row group descriptors if using indexes - if (_metadata->per_file_metadata[0].ps.compression != orc::NONE and not is_data_empty) { + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE and not is_data_empty) { auto decomp_data = decompress_stripe_data(chunks, stripe_data, - _metadata->per_file_metadata[0].decompressor.get(), + _metadata.per_file_metadata[0].decompressor.get(), stream_info, total_num_stripes, row_groups, - _metadata->get_row_index_stride(), + _metadata.get_row_index_stride(), level == 0, stream); stripe_data.clear(); @@ -1473,19 +1193,12 @@ table_with_metadata reader::impl::read(size_type skip_rows, num_columns, total_num_stripes, num_rowgroups, - _metadata->get_row_index_stride(), + _metadata.get_row_index_stride(), level == 0, stream); } } - // Setup table for converting timestamp columns from local to UTC time - auto const tz_table = - _has_timestamp_column - ? build_timezone_transition_table( - selected_stripes[0].stripe_info[0].second->writerTimezone, stream) - : timezone_table{}; - for (size_t i = 0; i < column_types.size(); ++i) { bool is_nullable = false; for (size_t j = 0; j < total_num_stripes; ++j) { @@ -1507,7 +1220,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, skip_rows, tz_table.view(), row_groups, - _metadata->get_row_index_stride(), + _metadata.get_row_index_stride(), out_buffers[level], level, stream); @@ -1556,7 +1269,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, out_metadata.schema_info = std::move(schema_info); - for (const auto& meta : _metadata->per_file_metadata) { + for (const auto& meta : _metadata.per_file_metadata) { for (const auto& kv : meta.ff.metadata) { out_metadata.user_data.insert({kv.name, kv.value}); } @@ -1583,6 +1296,7 @@ table_with_metadata reader::read(orc_reader_options const& options, rmm::cuda_st return _impl->read( options.get_skip_rows(), options.get_num_rows(), options.get_stripes(), stream); } + } // namespace orc } // namespace detail } // namespace io diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp index 4be75b6cc2a..c9de2211d48 100644 --- a/cpp/src/io/orc/reader_impl.hpp +++ b/cpp/src/io/orc/reader_impl.hpp @@ -16,6 +16,7 @@ #pragma once +#include "aggregate_orc_metadata.hpp" #include "orc.h" #include "orc_gpu.h" @@ -38,7 +39,6 @@ namespace io { namespace detail { namespace orc { using namespace cudf::io::orc; -using namespace cudf::io; // Forward declarations class metadata; @@ -46,13 +46,12 @@ namespace { struct orc_stream_info; struct stripe_source_mapping; } // namespace -class aggregate_orc_metadata; /** * @brief Keeps track of orc mapping and child column details. */ struct reader_column_meta { - std::vector> + std::vector> orc_col_map; // Mapping between column id in orc to processing order. std::vector num_child_rows; // number of rows in child columns @@ -174,7 +173,7 @@ class reader::impl { * @param col_buffers Column buffers for columns and children. * @param level Current nesting level. 
*/ - column_buffer&& assemble_buffer(const int32_t orc_col_id, + column_buffer&& assemble_buffer(const size_type orc_col_id, std::vector>& col_buffers, const size_t level, rmm::cuda_stream_view stream); @@ -201,21 +200,27 @@ class reader::impl { * * @return An empty column equivalent to orc column type. */ - std::unique_ptr create_empty_column(const int32_t orc_col_id, + std::unique_ptr create_empty_column(const size_type orc_col_id, column_name_info& schema_info, rmm::cuda_stream_view stream); + /** + * @brief Setup table for converting timestamp columns from local to UTC time + * + * @return Timezone table with timestamp offsets + */ + timezone_table compute_timezone_table( + const std::vector& selected_stripes, + rmm::cuda_stream_view stream); + private: rmm::mr::device_memory_resource* _mr = nullptr; std::vector> _sources; - std::unique_ptr _metadata; - // _output_columns associated schema indices - std::vector> _selected_columns; - - bool _use_index = true; - bool _use_np_dtypes = true; - bool _has_timestamp_column = false; - bool _has_nested_column = false; + cudf::io::orc::detail::aggregate_orc_metadata _metadata; + cudf::io::orc::detail::column_hierarchy selected_columns; + + bool _use_index = true; + bool _use_np_dtypes = true; std::vector _decimal_cols_as_float; data_type _timestamp_type{type_id::EMPTY}; reader_column_meta _col_meta; diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index b0cbfc34a21..bcbe77d9df8 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1789,17 +1789,29 @@ __global__ void __launch_bounds__(block_size) seconds += get_gmt_offset(tz_table.ttimes, tz_table.offsets, seconds); } if (seconds < 0 && nanos != 0) { seconds -= 1; } - if (s->chunk.ts_clock_rate) { - duration_ns d_ns{nanos}; - d_ns += duration_s{seconds}; - static_cast(data_out)[row] = - d_ns.count() * s->chunk.ts_clock_rate / - duration_ns::period::den; // Output to desired clock rate - } else { - cudf::duration_s d{seconds}; - static_cast(data_out)[row] = - cuda::std::chrono::duration_cast(d).count() + nanos; - } + + duration_ns d_ns{nanos}; + duration_s d_s{seconds}; + + static_cast(data_out)[row] = [&]() { + using cuda::std::chrono::duration_cast; + switch (s->chunk.timestamp_type_id) { + case type_id::TIMESTAMP_SECONDS: + return d_s.count() + duration_cast(d_ns).count(); + case type_id::TIMESTAMP_MILLISECONDS: + return duration_cast(d_s).count() + + duration_cast(d_ns).count(); + case type_id::TIMESTAMP_MICROSECONDS: + return duration_cast(d_s).count() + + duration_cast(d_ns).count(); + case type_id::TIMESTAMP_NANOSECONDS: + default: + return duration_cast(d_s).count() + + d_ns.count(); // nanoseconds as output in case of `type_id::EMPTY` and + // `type_id::TIMESTAMP_NANOSECONDS` + } + }(); + break; } } diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index c8ed0e36966..ff7b642be0e 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -349,13 +350,8 @@ template -static __device__ uint32_t IntegerRLE(orcenc_state_s* s, - const T* inbuf, - uint32_t inpos, - uint32_t numvals, - uint32_t flush, - int t, - Storage& temp_storage) +static __device__ uint32_t IntegerRLE( + orcenc_state_s* s, const T* inbuf, uint32_t inpos, uint32_t numvals, int t, Storage& temp_storage) { using block_reduce = cub::BlockReduce; uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; @@ -766,7 +762,8 @@ __global__ void 
__launch_bounds__(block_size) auto const column = *s->chunk.column; while (s->cur_row < s->chunk.num_rows || s->numvals + s->numlengths != 0) { // Fetch non-null values - if (s->chunk.type_kind != LIST && !s->stream.data_ptrs[CI_DATA]) { + auto const length_stream_only = s->chunk.type_kind == LIST or s->chunk.type_kind == MAP; + if (not length_stream_only && s->stream.data_ptrs[CI_DATA] == nullptr) { // Pass-through __syncthreads(); if (!t) { @@ -847,7 +844,8 @@ __global__ void __launch_bounds__(block_size) // Reusing the lengths array for the scale stream // Note: can be written in a faster manner, given that all values are equal case DECIMAL: s->lengths.u32[nz_idx] = zigzag(s->chunk.scale); break; - case LIST: { + case LIST: + case MAP: { auto const& offsets = column.child(lists_column_view::offsets_column_index); // Compute list length from the offsets s->lengths.u32[nz_idx] = offsets.element(row + 1 + column.offset()) - @@ -887,7 +885,7 @@ __global__ void __launch_bounds__(block_size) s->nnz += nz; s->numvals += nz; s->numlengths += (s->chunk.type_kind == TIMESTAMP || s->chunk.type_kind == DECIMAL || - s->chunk.type_kind == LIST || + s->chunk.type_kind == LIST || s->chunk.type_kind == MAP || (s->chunk.type_kind == STRING && s->chunk.encoding_kind != DICTIONARY_V2)) ? nz : 0; @@ -902,12 +900,12 @@ __global__ void __launch_bounds__(block_size) case INT: case DATE: n = IntegerRLE( - s, s->vals.i32, s->nnz - s->numvals, s->numvals, flush, t, temp_storage.i32); + s, s->vals.i32, s->nnz - s->numvals, s->numvals, t, temp_storage.i32); break; case LONG: case TIMESTAMP: n = IntegerRLE( - s, s->vals.i64, s->nnz - s->numvals, s->numvals, flush, t, temp_storage.i64); + s, s->vals.i64, s->nnz - s->numvals, s->numvals, t, temp_storage.i64); break; case BYTE: n = ByteRLE(s, s->vals.u8, s->nnz - s->numvals, s->numvals, flush, t); @@ -933,7 +931,7 @@ __global__ void __launch_bounds__(block_size) case STRING: if (s->chunk.encoding_kind == DICTIONARY_V2) { n = IntegerRLE( - s, s->vals.u32, s->nnz - s->numvals, s->numvals, flush, t, temp_storage.u32); + s, s->vals.u32, s->nnz - s->numvals, s->numvals, t, temp_storage.u32); } else { n = s->numvals; } @@ -956,17 +954,18 @@ __global__ void __launch_bounds__(block_size) } // Encode secondary stream values if (s->numlengths > 0) { - uint32_t flush = (s->cur_row == s->chunk.num_rows) ? 1 : 0, n; + uint32_t n; switch (s->chunk.type_kind) { case TIMESTAMP: n = IntegerRLE( - s, s->lengths.u64, s->nnz - s->numlengths, s->numlengths, flush, t, temp_storage.u64); + s, s->lengths.u64, s->nnz - s->numlengths, s->numlengths, t, temp_storage.u64); break; case DECIMAL: case LIST: + case MAP: case STRING: n = IntegerRLE( - s, s->lengths.u32, s->nnz - s->numlengths, s->numlengths, flush, t, temp_storage.u32); + s, s->lengths.u32, s->nnz - s->numlengths, s->numlengths, t, temp_storage.u32); break; default: n = s->numlengths; break; } @@ -1059,9 +1058,8 @@ __global__ void __launch_bounds__(block_size) if (t < numvals) s->lengths.u32[nz_idx] = count; __syncthreads(); if (s->numlengths + numvals > 0) { - uint32_t flush = (s->cur_row + numvals == s->nrows) ? 
1 : 0; - uint32_t n = IntegerRLE( - s, s->lengths.u32, s->cur_row, s->numlengths + numvals, flush, t, temp_storage); + uint32_t n = IntegerRLE( + s, s->lengths.u32, s->cur_row, s->numlengths + numvals, t, temp_storage); __syncthreads(); if (!t) { s->numlengths += numvals; @@ -1312,9 +1310,7 @@ void CompressOrcDataStreams(uint8_t* compressed_data, gpuInitCompressionBlocks<<>>( strm_desc, enc_streams, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); if (compression == SNAPPY) { - auto env_use_nvcomp = std::getenv("LIBCUDF_USE_NVCOMP"); - bool use_nvcomp = env_use_nvcomp != nullptr ? std::atoi(env_use_nvcomp) : 0; - if (use_nvcomp) { + if (detail::nvcomp_integration::is_stable_enabled()) { try { size_t temp_size; nvcompStatus_t nvcomp_status = nvcompBatchedSnappyCompressGetTempSize( diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 299c8fbb730..1e580e360ca 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -80,7 +80,7 @@ orc::CompressionKind to_orc_compression(compression_type compression) /** * @brief Function that translates GDF dtype to ORC datatype */ -constexpr orc::TypeKind to_orc_type(cudf::type_id id) +constexpr orc::TypeKind to_orc_type(cudf::type_id id, bool list_column_as_map) { switch (id) { case cudf::type_id::INT8: return TypeKind::BYTE; @@ -98,7 +98,7 @@ constexpr orc::TypeKind to_orc_type(cudf::type_id id) case cudf::type_id::STRING: return TypeKind::STRING; case cudf::type_id::DECIMAL32: case cudf::type_id::DECIMAL64: return TypeKind::DECIMAL; - case cudf::type_id::LIST: return TypeKind::LIST; + case cudf::type_id::LIST: return list_column_as_map ? TypeKind::MAP : TypeKind::LIST; case cudf::type_id::STRUCT: return TypeKind::STRUCT; default: return TypeKind::INVALID_TYPE_KIND; } @@ -151,11 +151,11 @@ class orc_column_view { _str_idx{str_idx}, _is_child{parent != nullptr}, _type_width{cudf::is_fixed_width(col.type()) ? cudf::size_of(col.type()) : 0}, - _scale{(to_orc_type(col.type().id()) == TypeKind::DECIMAL) ? -col.type().scale() - : to_clockscale(col.type().id())}, + _type_kind{to_orc_type(col.type().id(), metadata.is_map())}, + _scale{(_type_kind == TypeKind::DECIMAL) ? -col.type().scale() + : to_clockscale(col.type().id())}, _precision{metadata.is_decimal_precision_set() ? 
metadata.get_decimal_precision() : orc_precision(col.type().id())}, - _type_kind{to_orc_type(col.type().id())}, name{metadata.get_name()} { if (metadata.is_nullability_defined()) { nullable_from_metadata = metadata.nullable(); } @@ -163,6 +163,13 @@ class orc_column_view { parent->add_child(_index); _parent_index = parent->index(); } + + if (_type_kind == TypeKind::MAP) { + auto const struct_col = col.child(lists_column_view::child_column_index); + CUDF_EXPECTS(struct_col.null_count() == 0, + "struct column of a MAP column should not have null elements"); + CUDF_EXPECTS(struct_col.num_children() == 2, "MAP column must have two child columns"); + } } void add_child(uint32_t child_idx) { children.emplace_back(child_idx); } @@ -215,6 +222,7 @@ class orc_column_view { auto parent_index() const noexcept { return _parent_index.value(); } auto child_begin() const noexcept { return children.cbegin(); } auto child_end() const noexcept { return children.cend(); } + auto num_children() const noexcept { return children.size(); } auto type_width() const noexcept { return _type_width; } auto size() const noexcept { return cudf_column.size(); } @@ -241,15 +249,15 @@ class orc_column_view { int _str_idx; bool _is_child = false; - size_t _type_width = 0; - int32_t _scale = 0; - int32_t _precision = 0; - // ORC-related members TypeKind _type_kind = INVALID_TYPE_KIND; ColumnEncodingKind _encoding_kind = INVALID_ENCODING_KIND; std::string name; + size_t _type_width = 0; + int32_t _scale = 0; + int32_t _precision = 0; + // String dictionary-related members size_t _dict_stride = 0; gpu::DictionaryChunk const* dict = nullptr; @@ -276,23 +284,18 @@ size_type orc_table_view::num_rows() const noexcept * * @param columns List of columns * @param rowgroup_bounds Ranges of rows in each rowgroup [rowgroup][column] - * @param max_stripe_bytes Maximum size of each stripe, in bytes + * @param max_stripe_size Maximum size of each stripe, both in bytes and in rows * @return List of stripe descriptors */ file_segmentation calculate_segmentation(host_span columns, hostdevice_2dvector&& rowgroup_bounds, - uint32_t max_stripe_bytes) + stripe_size_limits max_stripe_size) { - auto const is_any_column_string = - std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.is_string(); }); - // Apply rows per stripe limit to limit string dictionaries - size_t const max_stripe_rows = is_any_column_string ? 
1000000 : 5000000; - std::vector infos; auto const num_rowgroups = rowgroup_bounds.size().first; size_t stripe_start = 0; size_t stripe_bytes = 0; - size_t stripe_rows = 0; + size_type stripe_rows = 0; for (size_t rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { auto const rowgroup_total_bytes = std::accumulate(columns.begin(), columns.end(), 0ul, [&](size_t total_size, auto const& col) { @@ -311,8 +314,8 @@ file_segmentation calculate_segmentation(host_span column [](auto& l, auto& r) { return l.size() < r.size(); }) ->size(); // Check if adding the current rowgroup to the stripe will make the stripe too large or long - if ((rg_idx > stripe_start) && (stripe_bytes + rowgroup_total_bytes > max_stripe_bytes || - stripe_rows + rowgroup_rows_max > max_stripe_rows)) { + if ((rg_idx > stripe_start) && (stripe_bytes + rowgroup_total_bytes > max_stripe_size.bytes || + stripe_rows + rowgroup_rows_max > max_stripe_size.rows)) { infos.emplace_back(infos.size(), stripe_start, rg_idx - stripe_start); stripe_start = rg_idx; stripe_bytes = 0; @@ -443,6 +446,16 @@ void writer::impl::build_dictionaries(orc_table_view& orc_table, stripe_dict.device_to_host(stream, true); } +/** + * @brief Returns the maximum size of RLE encoded values of an integer type. + **/ +template +size_t max_varint_size() +{ + // varint encodes 7 bits in each byte + return cudf::util::div_rounding_up_unsafe(sizeof(T) * 8, 7); +} + constexpr size_t RLE_stream_size(TypeKind kind, size_t count) { using cudf::util::div_rounding_up_unsafe; @@ -454,16 +467,16 @@ constexpr size_t RLE_stream_size(TypeKind kind, size_t count) return div_rounding_up_unsafe(count, byte_rle_max_len) * (byte_rle_max_len + 1); case TypeKind::SHORT: return div_rounding_up_unsafe(count, gpu::encode_block_size) * - (gpu::encode_block_size * sizeof(int16_t) + 2); + (gpu::encode_block_size * max_varint_size() + 2); case TypeKind::FLOAT: case TypeKind::INT: case TypeKind::DATE: return div_rounding_up_unsafe(count, gpu::encode_block_size) * - (gpu::encode_block_size * sizeof(int32_t) + 2); + (gpu::encode_block_size * max_varint_size() + 2); case TypeKind::LONG: case TypeKind::DOUBLE: return div_rounding_up_unsafe(count, gpu::encode_block_size) * - (gpu::encode_block_size * sizeof(int64_t) + 2); + (gpu::encode_block_size * max_varint_size() + 2); default: CUDF_FAIL("Unsupported ORC type for RLE stream size"); } } @@ -604,6 +617,7 @@ orc_streams writer::impl::create_streams(host_span columns, column.set_orc_encoding(DIRECT_V2); break; case TypeKind::LIST: + case TypeKind::MAP: // no data stream, only lengths add_RLE_stream(gpu::CI_DATA2, LENGTH, TypeKind::INT); column.set_orc_encoding(DIRECT_V2); @@ -1150,7 +1164,6 @@ void writer::impl::write_index_stream(int32_t stripe_id, row_group_index_info present; row_group_index_info data; row_group_index_info data2; - auto kind = TypeKind::STRUCT; auto const column_id = stream_id - 1; auto find_record = [=, &strm_desc](gpu::encoder_chunk_streams const& stream, @@ -1183,6 +1196,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, } }; + auto kind = TypeKind::STRUCT; // TBD: Not sure we need an empty index stream for column 0 if (stream_id != 0) { const auto& strm = enc_streams[column_id][0]; @@ -1223,30 +1237,36 @@ void writer::impl::write_index_stream(int32_t stripe_id, stripe->indexLength += buffer_.size(); } -void writer::impl::write_data_stream(gpu::StripeStream const& strm_desc, - gpu::encoder_chunk_streams const& enc_stream, - uint8_t const* compressed_data, - uint8_t* stream_out, - StripeInformation* stripe, - 
orc_streams* streams) +std::future writer::impl::write_data_stream(gpu::StripeStream const& strm_desc, + gpu::encoder_chunk_streams const& enc_stream, + uint8_t const* compressed_data, + uint8_t* stream_out, + StripeInformation* stripe, + orc_streams* streams) { const auto length = strm_desc.stream_size; (*streams)[enc_stream.ids[strm_desc.stream_type]].length = length; - if (length == 0) { return; } + if (length == 0) { + return std::async(std::launch::deferred, [] {}); + } const auto* stream_in = (compression_kind_ == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] : (compressed_data + strm_desc.bfr_offset); - if (out_sink_->is_device_write_preferred(length)) { - out_sink_->device_write(stream_in, length, stream); - } else { - CUDA_TRY( - cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDeviceToHost, stream.value())); - stream.synchronize(); + auto write_task = [&]() { + if (out_sink_->is_device_write_preferred(length)) { + return out_sink_->device_write_async(stream_in, length, stream); + } else { + CUDA_TRY( + cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); - out_sink_->host_write(stream_out, length); - } + out_sink_->host_write(stream_out, length); + return std::async(std::launch::deferred, [] {}); + } + }(); stripe->dataLength += length; + return write_task; } void writer::impl::add_uncompressed_block_headers(std::vector& v) @@ -1276,6 +1296,8 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), + max_stripe_size{options.stripe_size_bytes(), options.stripe_size_rows()}, + row_index_stride{options.row_index_stride()}, compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.enable_statistics()), single_write_mode(mode == SingleWriteMode::YES), @@ -1294,6 +1316,8 @@ writer::impl::impl(std::unique_ptr sink, rmm::mr::device_memory_resource* mr) : _mr(mr), stream(stream), + max_stripe_size{options.stripe_size_bytes(), options.stripe_size_rows()}, + row_index_stride{options.row_index_stride()}, compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.enable_statistics()), single_write_mode(mode == SingleWriteMode::YES), @@ -1365,7 +1389,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, std::vector> pd_masks; for (auto const& col : orc_table.columns) { // Leaf columns don't need pushdown masks - if (col.orc_kind() != LIST && col.orc_kind() != STRUCT) { + if (col.num_children() == 0) { mask_ptrs.emplace_back(nullptr); continue; } @@ -1396,10 +1420,10 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, thrust::bit_and()); } } - if (col.orc_kind() == LIST) { + if (col.orc_kind() == LIST or col.orc_kind() == MAP) { // Need a new pushdown mask unless both the parent and current colmn are not nullable auto const child_col = orc_table.column(col.child_begin()[0]); - // pushdown mask applies to child column; use the child column size + // pushdown mask applies to child column(s); use the child column size pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream); mask_ptrs.emplace_back(pd_masks.back().data()); pushdown_lists_null_mask(col, orc_table.d_columns, parent_pd_mask, pd_masks.back(), stream); @@ -1462,14 +1486,20 @@ orc_table_view make_orc_table_view(table_view const& table, orc_columns.emplace_back(new_col_idx, str_idx, parent_col, col, col_meta); if (orc_columns[new_col_idx].is_string()) { str_col_indexes.push_back(new_col_idx); 
} - if (col.type().id() == type_id::LIST) { + auto const kind = orc_columns[new_col_idx].orc_kind(); + if (kind == TypeKind::LIST) { append_orc_column(col.child(lists_column_view::child_column_index), &orc_columns[new_col_idx], col_meta.child(lists_column_view::child_column_index)); - } else if (col.type().id() == type_id::STRUCT) { - for (auto child_idx = 0; child_idx != col.num_children(); ++child_idx) - append_orc_column( - col.child(child_idx), &orc_columns[new_col_idx], col_meta.child(child_idx)); + } else if (kind == TypeKind::STRUCT or kind == TypeKind::MAP) { + // MAP: skip to the list child - include grandchildren columns instead of children + auto const real_parent_col = + kind == TypeKind::MAP ? col.child(lists_column_view::child_column_index) : col; + for (auto child_idx = 0; child_idx != real_parent_col.num_children(); ++child_idx) { + append_orc_column(real_parent_col.child(child_idx), + &orc_columns[new_col_idx], + col_meta.child(child_idx)); + } } }; @@ -1477,6 +1507,14 @@ orc_table_view make_orc_table_view(table_view const& table, append_orc_column(table.column(col_idx), nullptr, table_meta.column_metadata[col_idx]); } + std::vector type_kinds; + type_kinds.reserve(orc_columns.size()); + std::transform( + orc_columns.cbegin(), orc_columns.cend(), std::back_inserter(type_kinds), [](auto& orc_column) { + return orc_column.orc_kind(); + }); + auto const d_type_kinds = cudf::detail::make_device_uvector_async(type_kinds, stream); + rmm::device_uvector d_orc_columns(orc_columns.size(), stream); using stack_value_type = thrust::pair>; rmm::device_uvector stack_storage(orc_columns.size(), stream); @@ -1484,6 +1522,7 @@ orc_table_view make_orc_table_view(table_view const& table, // pre-order append ORC device columns cudf::detail::device_single_thread( [d_orc_cols = device_span{d_orc_columns}, + d_type_kinds = device_span{d_type_kinds}, d_table = d_table, stack_storage = stack_storage.data(), stack_storage_size = stack_storage.size()] __device__() { @@ -1501,6 +1540,11 @@ orc_table_view make_orc_table_view(table_view const& table, auto [col, parent] = stack.pop(); d_orc_cols[idx] = orc_column_device_view{*col, parent}; + if (d_type_kinds[idx] == TypeKind::MAP) { + // Skip to the list child - do not include the child column, just grandchildren columns + col = &col->children()[lists_column_view::child_column_index]; + } + if (col->type().id() == type_id::LIST) { stack.push({&col->children()[lists_column_view::child_column_index], idx}); } else if (col->type().id() == type_id::STRUCT) { @@ -1511,7 +1555,7 @@ orc_table_view make_orc_table_view(table_view const& table, stack.push({&c, idx}); }); } - idx++; + ++idx; } }, stream); @@ -1736,7 +1780,7 @@ void writer::impl::write(table_view const& table) auto const pd_masks = init_pushdown_null_masks(orc_table, stream); - auto rowgroup_bounds = calculate_rowgroup_bounds(orc_table, row_index_stride_, stream); + auto rowgroup_bounds = calculate_rowgroup_bounds(orc_table, row_index_stride, stream); // Build per-column dictionary indices auto dictionaries = allocate_dictionaries(orc_table, rowgroup_bounds, stream); @@ -1753,7 +1797,7 @@ void writer::impl::write(table_view const& table) // Decide stripe boundaries based on rowgroups and dict chunks auto const segmentation = - calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size_); + calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size); // Build stripe-level dictionaries hostdevice_2dvector stripe_dict( @@ -1850,6 +1894,7 @@ void 
writer::impl::write(table_view const& table) ProtobufWriter pbw_(&buffer_); // Write stripes + std::vector> write_tasks; for (size_t stripe_id = 0; stripe_id < stripes.size(); ++stripe_id) { auto const& rowgroups_range = segmentation.stripes[stripe_id]; auto& stripe = stripes[stripe_id]; @@ -1872,12 +1917,13 @@ void writer::impl::write(table_view const& table) // Column data consisting one or more separate streams for (auto const& strm_desc : strm_descs[stripe_id]) { - write_data_stream(strm_desc, - enc_data.streams[strm_desc.column_id][rowgroups_range.first], - static_cast(compressed_data.data()), - stream_output.get(), - &stripe, - &streams); + write_tasks.push_back( + write_data_stream(strm_desc, + enc_data.streams[strm_desc.column_id][rowgroups_range.first], + static_cast(compressed_data.data()), + stream_output.get(), + &stripe, + &streams)); } // Write stripefooter consisting of stream information @@ -1904,6 +1950,9 @@ void writer::impl::write(table_view const& table) } out_sink_->host_write(buffer_.data(), buffer_.size()); } + for (auto const& task : write_tasks) { + task.wait(); + } if (column_stats.size() != 0) { // File-level statistics @@ -1942,7 +1991,7 @@ void writer::impl::write(table_view const& table) if (ff.headerLength == 0) { // First call ff.headerLength = std::strlen(MAGIC); - ff.rowIndexStride = row_index_stride_; + ff.rowIndexStride = row_index_stride; ff.types.resize(1 + orc_table.num_columns()); ff.types[0].kind = STRUCT; for (auto const& column : orc_table.columns) { @@ -1958,13 +2007,11 @@ void writer::impl::write(table_view const& table) schema_type.scale = static_cast(column.scale()); schema_type.precision = column.precision(); } - // In preorder traversal the column after a list column is always the child column - if (column.orc_kind() == LIST) { schema_type.subtypes.emplace_back(column.id() + 1); } + std::transform(column.child_begin(), + column.child_end(), + std::back_inserter(schema_type.subtypes), + [&](auto const& child_idx) { return orc_table.column(child_idx).id(); }); if (column.orc_kind() == STRUCT) { - std::transform(column.child_begin(), - column.child_end(), - std::back_inserter(schema_type.subtypes), - [&](auto const& child_idx) { return orc_table.column(child_idx).id(); }); std::transform(column.child_begin(), column.child_end(), std::back_inserter(schema_type.fieldNames), diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index a8fe22a360f..89b8c305424 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -166,6 +166,14 @@ struct string_dictionaries { thrust::host_vector dictionary_enabled; }; +/** + * @brief Maximum size of stripes in the output file. 
+ */ +struct stripe_size_limits { + size_t bytes; + size_type rows; +}; + /** * @brief Implementation for ORC writer */ @@ -173,9 +181,6 @@ class writer::impl { // ORC datasets start with a 3 byte header static constexpr const char* MAGIC = "ORC"; - // ORC datasets are divided into fixed-size, independent stripes - static constexpr uint32_t DEFAULT_STRIPE_SIZE = 64 * 1024 * 1024; - // ORC compresses streams into independent chunks static constexpr uint32_t DEFAULT_COMPRESSION_BLOCKSIZE = 256 * 1024; @@ -325,13 +330,14 @@ class writer::impl { * @param[in,out] stream_out Temporary host output buffer * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams + * @return An std::future that should be synchronized to ensure the writing is complete */ - void write_data_stream(gpu::StripeStream const& strm_desc, - gpu::encoder_chunk_streams const& enc_stream, - uint8_t const* compressed_data, - uint8_t* stream_out, - StripeInformation* stripe, - orc_streams* streams); + std::future write_data_stream(gpu::StripeStream const& strm_desc, + gpu::encoder_chunk_streams const& enc_stream, + uint8_t const* compressed_data, + uint8_t* stream_out, + StripeInformation* stripe, + orc_streams* streams); /** * @brief Insert 3-byte uncompressed block headers in a byte vector @@ -345,8 +351,8 @@ class writer::impl { // Cuda stream to be used rmm::cuda_stream_view stream = rmm::cuda_stream_default; - size_t max_stripe_size_ = DEFAULT_STRIPE_SIZE; - size_t row_index_stride_ = default_row_index_stride; + stripe_size_limits max_stripe_size; + size_type row_index_stride; size_t compression_blocksize_ = DEFAULT_COMPRESSION_BLOCKSIZE; CompressionKind compression_kind_ = CompressionKind::NONE; diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index ebc655578f7..337d9faec20 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -623,7 +623,7 @@ inline __device__ void gpuStoreOutput(uint2* dst, * * @param[in,out] s Page state input/output * @param[in] src_pos Source position - * @param[in] dst Pointer to row output data + * @param[out] dst Pointer to row output data */ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src_pos, int64_t* dst) { @@ -631,7 +631,6 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src const uint8_t* src8; uint32_t dict_pos, dict_size = s->dict_size, ofs; - int64_t ts; if (s->dict_base) { // Dictionary @@ -646,36 +645,46 @@ inline __device__ void gpuOutputInt96Timestamp(volatile page_state_s* s, int src ofs = 3 & reinterpret_cast(src8); src8 -= ofs; // align to 32-bit boundary ofs <<= 3; // bytes -> bits - if (dict_pos + 4 < dict_size) { - uint3 v; - int64_t nanos, days; - v.x = *reinterpret_cast(src8 + dict_pos + 0); - v.y = *reinterpret_cast(src8 + dict_pos + 4); - v.z = *reinterpret_cast(src8 + dict_pos + 8); - if (ofs) { - uint32_t next = *reinterpret_cast(src8 + dict_pos + 12); - v.x = __funnelshift_r(v.x, v.y, ofs); - v.y = __funnelshift_r(v.y, v.z, ofs); - v.z = __funnelshift_r(v.z, next, ofs); - } - nanos = v.y; - nanos <<= 32; - nanos |= v.x; - // Convert from Julian day at noon to UTC seconds - days = static_cast(v.z); - cudf::duration_D d{ - days - 2440588}; // TBD: Should be noon instead of midnight, but this matches pyarrow - if (s->col.ts_clock_rate) { - int64_t secs = duration_cast(d).count() + - duration_cast(cudf::duration_ns{nanos}).count(); - ts = secs * s->col.ts_clock_rate; // Output to desired clock rate - } else { - ts = 
duration_cast(d).count() + nanos; - } - } else { - ts = 0; + + if (dict_pos + 4 >= dict_size) { + *dst = 0; + return; } - *dst = ts; + + uint3 v; + int64_t nanos, days; + v.x = *reinterpret_cast(src8 + dict_pos + 0); + v.y = *reinterpret_cast(src8 + dict_pos + 4); + v.z = *reinterpret_cast(src8 + dict_pos + 8); + if (ofs) { + uint32_t next = *reinterpret_cast(src8 + dict_pos + 12); + v.x = __funnelshift_r(v.x, v.y, ofs); + v.y = __funnelshift_r(v.y, v.z, ofs); + v.z = __funnelshift_r(v.z, next, ofs); + } + nanos = v.y; + nanos <<= 32; + nanos |= v.x; + // Convert from Julian day at noon to UTC seconds + days = static_cast(v.z); + cudf::duration_D d_d{ + days - 2440588}; // TBD: Should be noon instead of midnight, but this matches pyarrow + + *dst = [&]() { + switch (s->col.ts_clock_rate) { + case 1: // seconds + return duration_cast(d_d).count() + + duration_cast(duration_ns{nanos}).count(); + case 1'000: // milliseconds + return duration_cast(d_d).count() + + duration_cast(duration_ns{nanos}).count(); + case 1'000'000: // microseconds + return duration_cast(d_d).count() + + duration_cast(duration_ns{nanos}).count(); + case 1'000'000'000: // nanoseconds + default: return duration_cast(d_d).count() + nanos; + } + }(); } /** diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 06a696d6751..f144a02bc89 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -22,6 +22,7 @@ #include "reader_impl.hpp" #include +#include #include #include @@ -1154,9 +1155,6 @@ rmm::device_buffer reader::impl::decompress_page_data( cudaMemcpyHostToDevice, stream.value())); - auto env_use_nvcomp = std::getenv("LIBCUDF_USE_NVCOMP"); - bool use_nvcomp = env_use_nvcomp != nullptr ? std::atoi(env_use_nvcomp) : 0; - switch (codec.compression_type) { case parquet::GZIP: CUDA_TRY(gpuinflate(inflate_in.device_ptr(start_pos), @@ -1166,7 +1164,7 @@ rmm::device_buffer reader::impl::decompress_page_data( stream)) break; case parquet::SNAPPY: - if (use_nvcomp) { + if (nvcomp_integration::is_stable_enabled()) { snappy_decompress(inflate_in_view.subspan(start_pos, argc - start_pos), inflate_out_view.subspan(start_pos, argc - start_pos), codec.max_decompressed_size, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 0382a7bb7ba..2ab5d7d696b 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -23,6 +23,7 @@ #include "writer_impl.hpp" #include +#include #include "compact_protocol_writer.hpp" #include @@ -990,11 +991,9 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks device_span comp_stat{compression_status.data(), compression_status.size()}; gpu::EncodePages(batch_pages, comp_in, comp_stat, stream); - auto env_use_nvcomp = std::getenv("LIBCUDF_USE_NVCOMP"); - bool use_nvcomp = env_use_nvcomp != nullptr ? std::atoi(env_use_nvcomp) : 0; switch (compression_) { case parquet::Compression::SNAPPY: - if (use_nvcomp) { + if (nvcomp_integration::is_stable_enabled()) { snappy_compress(comp_in, comp_stat, max_page_uncomp_data_size, stream); } else { CUDA_TRY(gpu_snap(comp_in.data(), comp_stat.data(), pages_in_batch, stream)); @@ -1379,6 +1378,7 @@ void writer::impl::write(table_view const& table) (stats_granularity_ == statistics_freq::STATISTICS_PAGE) ? page_stats.data() : nullptr, (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? 
page_stats.data() + num_pages : nullptr); + std::vector> write_tasks; for (; r < rnext; r++, global_r++) { for (auto i = 0; i < num_columns; i++) { gpu::EncColumnChunk* ck = &chunks[r][i]; @@ -1392,7 +1392,8 @@ void writer::impl::write(table_view const& table) if (out_sink_->is_device_write_preferred(ck->compressed_size)) { // let the writer do what it wants to retrieve the data from the gpu. - out_sink_->device_write(dev_bfr + ck->ck_stat_size, ck->compressed_size, stream); + write_tasks.push_back( + out_sink_->device_write_async(dev_bfr + ck->ck_stat_size, ck->compressed_size, stream)); // we still need to do a (much smaller) memcpy for the statistics. if (ck->ck_stat_size != 0) { md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize(ck->ck_stat_size); @@ -1438,6 +1439,9 @@ void writer::impl::write(table_view const& table) current_chunk_offset += ck->compressed_size; } } + for (auto const& task : write_tasks) { + task.wait(); + } } } diff --git a/cpp/src/io/statistics/column_statistics.cuh b/cpp/src/io/statistics/column_statistics.cuh index 8dd4afb0c4d..7bca5f6b301 100644 --- a/cpp/src/io/statistics/column_statistics.cuh +++ b/cpp/src/io/statistics/column_statistics.cuh @@ -95,16 +95,18 @@ struct calculate_group_statistics_functor { using type_convert = detail::type_conversion>; using CT = typename type_convert::template type; - typed_statistics_chunk::include_aggregate> chunk( - s.group.num_rows); + typed_statistics_chunk::include_aggregate> chunk; for (uint32_t i = 0; i < s.group.num_rows; i += block_size) { - uint32_t r = i + t; - uint32_t row = r + s.group.start_row; - auto const is_valid = (r < s.group.num_rows) ? s.col.leaf_column->is_valid(row) : 0; - if (is_valid) { - auto converted_value = type_convert::convert(s.col.leaf_column->element(row)); - chunk.reduce(converted_value); + uint32_t r = i + t; + uint32_t row = r + s.group.start_row; + if (r < s.group.num_rows) { + if (s.col.leaf_column->is_valid(row)) { + auto converted_value = type_convert::convert(s.col.leaf_column->element(row)); + chunk.reduce(converted_value); + } else { + chunk.null_count++; + } } } @@ -120,13 +122,18 @@ struct calculate_group_statistics_functor { __device__ void operator()(stats_state_s& s, uint32_t t) { detail::storage_wrapper storage(temp_storage); - typed_statistics_chunk chunk(s.group.num_rows); + typed_statistics_chunk chunk; for (uint32_t i = 0; i < s.group.num_rows; i += block_size) { - uint32_t r = i + t; - uint32_t row = r + s.group.start_row; - auto const is_valid = (r < s.group.num_rows) ? 
s.col.leaf_column->is_valid(row) : 0; - chunk.non_nulls += is_valid; + uint32_t r = i + t; + uint32_t row = r + s.group.start_row; + if (r < s.group.num_rows) { + if (s.col.leaf_column->is_valid(row)) { + chunk.non_nulls++; + } else { + chunk.null_count++; + } + } } cub::BlockReduce(storage.template get()).Sum(chunk.non_nulls); diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index 9211ba13d26..0992a557491 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -92,7 +92,6 @@ struct typed_statistics_chunk { using E = typename detail::extrema_type::type; using A = typename detail::aggregation_type::type; - uint32_t num_rows; //!< number of non-null values in chunk uint32_t non_nulls; //!< number of non-null values in chunk uint32_t null_count; //!< number of null values in chunk @@ -103,9 +102,8 @@ struct typed_statistics_chunk { uint8_t has_minmax; //!< Nonzero if min_value and max_values are valid uint8_t has_sum; //!< Nonzero if sum is valid - __device__ typed_statistics_chunk(const uint32_t _num_rows = 0) - : num_rows(_num_rows), - non_nulls(0), + __device__ typed_statistics_chunk() + : non_nulls(0), null_count(0), minimum_value(detail::minimum_identity()), maximum_value(detail::maximum_identity()), @@ -135,7 +133,6 @@ struct typed_statistics_chunk { } non_nulls += chunk.non_nulls; null_count += chunk.null_count; - num_rows += (chunk.non_nulls + chunk.null_count); } }; @@ -143,7 +140,6 @@ template struct typed_statistics_chunk { using E = typename detail::extrema_type::type; - uint32_t num_rows; //!< number of non-null values in chunk uint32_t non_nulls; //!< number of non-null values in chunk uint32_t null_count; //!< number of null values in chunk @@ -153,9 +149,8 @@ struct typed_statistics_chunk { uint8_t has_minmax; //!< Nonzero if min_value and max_values are valid uint8_t has_sum; //!< Nonzero if sum is valid - __device__ typed_statistics_chunk(const uint32_t _num_rows = 0) - : num_rows(_num_rows), - non_nulls(0), + __device__ typed_statistics_chunk() + : non_nulls(0), null_count(0), minimum_value(detail::minimum_identity()), maximum_value(detail::maximum_identity()), @@ -180,7 +175,6 @@ struct typed_statistics_chunk { } non_nulls += chunk.non_nulls; null_count += chunk.null_count; - num_rows += (chunk.non_nulls + chunk.null_count); } }; @@ -240,7 +234,7 @@ get_untyped_chunk(const typed_statistics_chunk& chunk) using E = typename detail::extrema_type::type; statistics_chunk stat{}; stat.non_nulls = chunk.non_nulls; - stat.null_count = chunk.num_rows - chunk.non_nulls; + stat.null_count = chunk.null_count; stat.has_minmax = chunk.has_minmax; stat.has_sum = [&]() { if (!chunk.has_minmax) return false; diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 40a5d411290..d328a831708 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -144,7 +144,7 @@ std::unique_ptr empty_like(column_buffer& buffer, switch (buffer.type.id()) { case type_id::LIST: { // make offsets column - auto offsets = cudf::make_empty_column(data_type{type_id::INT32}); + auto offsets = cudf::make_empty_column(type_id::INT32); column_name_info* child_info = nullptr; if (schema_info != nullptr) { diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/config_utils.hpp new file mode 100644 index 00000000000..a1d8e747e44 --- /dev/null +++ b/cpp/src/io/utilities/config_utils.hpp @@ -0,0 
+1,67 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf::io::detail { + +/** + * @brief Returns the value of the environment variable, or a default value if the variable is not + * present. + */ +inline std::string getenv_or(std::string const& env_var_name, std::string_view default_val) +{ + auto const env_val = std::getenv(env_var_name.c_str()); + return std::string{(env_val == nullptr) ? default_val : env_val}; +} + +namespace nvcomp_integration { + +namespace { +/** + * @brief Defines which nvCOMP usage to enable. + */ +enum class usage_policy : uint8_t { OFF, STABLE, ALWAYS }; + +/** + * @brief Get the current usage policy. + */ +inline usage_policy get_env_policy() +{ + static auto const env_val = getenv_or("LIBCUDF_NVCOMP_POLICY", "STABLE"); + if (env_val == "OFF") return usage_policy::OFF; + if (env_val == "ALWAYS") return usage_policy::ALWAYS; + return usage_policy::STABLE; +} +} // namespace + +/** + * @brief Returns true if all nvCOMP uses are enabled. + */ +inline bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; } + +/** + * @brief Returns true if stable nvCOMP use is enabled. + */ +inline bool is_stable_enabled() +{ + return is_all_enabled() or get_env_policy() == usage_policy::STABLE; +} + +} // namespace nvcomp_integration +} // namespace cudf::io::detail diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index d133d813ab3..a0ed54b96ef 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -64,6 +64,17 @@ class file_sink : public data_sink { _bytes_written += size; } + std::future device_write_async(void const* gpu_data, + size_t size, + rmm::cuda_stream_view stream) override + { + if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); + + auto result = _cufile_out->write_async(gpu_data, _bytes_written, size); + _bytes_written += size; + return result; + } + private: std::ofstream _output_stream; size_t _bytes_written = 0; @@ -111,6 +122,14 @@ class void_sink : public data_sink { _bytes_written += size; } + std::future device_write_async(void const* gpu_data, + size_t size, + rmm::cuda_stream_view stream) override + { + _bytes_written += size; + return std::async(std::launch::deferred, [] {}); + } + void flush() override {} size_t bytes_written() override { return _bytes_written; } @@ -132,10 +151,19 @@ class user_sink_wrapper : public data_sink { void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { CUDF_EXPECTS(user_sink->supports_device_write(), - "device_write() being called on a data_sink that doesn't support it"); + "device_write() was called on a data_sink that doesn't support it"); user_sink->device_write(gpu_data, size, stream); } + std::future device_write_async(void const* gpu_data, + size_t size, + rmm::cuda_stream_view stream) override + { + 
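The new config_utils.hpp centralizes environment-driven feature toggles: getenv_or falls back to a default when the variable is unset, and the nvCOMP policy is parsed once into an OFF/STABLE/ALWAYS enum with STABLE as the default. A self-contained sketch of that parse-once pattern, using illustrative names that mirror the header's logic:

#include <cstdint>
#include <cstdlib>
#include <string>
#include <string_view>

enum class usage_policy_sketch : std::uint8_t { OFF, STABLE, ALWAYS };

inline std::string getenv_or_sketch(std::string const& name, std::string_view default_val)
{
  auto const* val = std::getenv(name.c_str());
  return std::string{val == nullptr ? default_val : std::string_view{val}};
}

inline usage_policy_sketch get_policy_sketch()
{
  // Parsed once and cached for the process lifetime, as in the header above.
  static auto const env_val = getenv_or_sketch("LIBCUDF_NVCOMP_POLICY", "STABLE");
  if (env_val == "OFF") return usage_policy_sketch::OFF;
  if (env_val == "ALWAYS") return usage_policy_sketch::ALWAYS;
  return usage_policy_sketch::STABLE;
}

inline bool is_stable_enabled_sketch()
{
  auto const policy = get_policy_sketch();
  return policy == usage_policy_sketch::STABLE or policy == usage_policy_sketch::ALWAYS;
}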
CUDF_EXPECTS(user_sink->supports_device_write(), + "device_write_async() was called on a data_sink that doesn't support it"); + return user_sink->device_write_async(gpu_data, size, stream); + } + void flush() override { user_sink->flush(); } size_t bytes_written() override { return user_sink->bytes_written(); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index 7b55bf82f15..387452e171a 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -15,6 +15,7 @@ */ #include "file_io_utilities.hpp" #include +#include #include @@ -48,12 +49,6 @@ file_wrapper::file_wrapper(std::string const& filepath, int flags, mode_t mode) file_wrapper::~file_wrapper() { close(fd); } -std::string getenv_or(std::string const& env_var_name, std::string const& default_val) -{ - auto const env_val = std::getenv(env_var_name.c_str()); - return (env_val == nullptr) ? default_val : std::string(env_val); -} - #ifdef CUFILE_FOUND cufile_config::cufile_config() : policy{getenv_or("LIBCUDF_CUFILE_POLICY", default_policy)} @@ -185,6 +180,31 @@ std::unique_ptr cufile_input_impl::read(size_t offset, return datasource::buffer::create(std::move(out_data)); } +namespace { + +template > +std::vector> make_sliced_tasks( + F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool) +{ + std::vector> slice_tasks; + constexpr size_t max_slice_bytes = 4 * 1024 * 1024; + size_t const n_slices = util::div_rounding_up_safe(size, max_slice_bytes); + size_t slice_offset = 0; + for (size_t t = 0; t < n_slices; ++t) { + DataT* ptr_slice = ptr + slice_offset; + + size_t const slice_size = (t == n_slices - 1) ? size % max_slice_bytes : max_slice_bytes; + slice_tasks.push_back(pool.submit(function, ptr_slice, slice_size, offset + slice_offset)); + + slice_offset += slice_size; + } + return slice_tasks; +} + +} // namespace + std::future cufile_input_impl::read_async(size_t offset, size_t size, uint8_t* dst, @@ -193,32 +213,22 @@ std::future cufile_input_impl::read_async(size_t offset, int device; cudaGetDevice(&device); - auto read_slice = [=](void* dst, size_t size, size_t offset) -> ssize_t { + auto read_slice = [device, gds_read = shim->read, file_handle = cf_file.handle()]( + void* dst, size_t size, size_t offset) -> ssize_t { cudaSetDevice(device); - auto read_size = shim->read(cf_file.handle(), dst, size, offset, 0); + auto read_size = gds_read(file_handle, dst, size, offset, 0); CUDF_EXPECTS(read_size != -1, "cuFile error reading from a file"); return read_size; }; - std::vector> slice_tasks; - constexpr size_t max_slice_bytes = 4 * 1024 * 1024; - size_t n_slices = util::div_rounding_up_safe(size, max_slice_bytes); - size_t slice_size = max_slice_bytes; - size_t slice_offset = 0; - for (size_t t = 0; t < n_slices; ++t) { - void* dst_slice = dst + slice_offset; + auto slice_tasks = make_sliced_tasks(read_slice, dst, offset, size, pool); - if (t == n_slices - 1) { slice_size = size % max_slice_bytes; } - slice_tasks.push_back(pool.submit(read_slice, dst_slice, slice_size, offset + slice_offset)); - - slice_offset += slice_size; - } - auto waiter = [](decltype(slice_tasks) slice_tasks) -> size_t { + auto waiter = [](auto slice_tasks) -> size_t { return std::accumulate(slice_tasks.begin(), slice_tasks.end(), 0, [](auto sum, auto& task) { return sum + task.get(); }); }; - // The future returned from this function is deferred, not async becasue we want to avoid creating + // The future returned from this function 
is deferred, not async because we want to avoid creating // threads for each read_async call. This overhead is significant in case of multiple small reads. return std::async(std::launch::deferred, waiter, std::move(slice_tasks)); } @@ -233,14 +243,42 @@ size_t cufile_input_impl::read(size_t offset, } cufile_output_impl::cufile_output_impl(std::string const& filepath) - : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664) + : shim{cufile_shim::instance()}, + cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664), + pool(16) { } void cufile_output_impl::write(void const* data, size_t offset, size_t size) { - CUDF_EXPECTS(shim->write(cf_file.handle(), data, size, offset, 0) != -1, - "cuFile error writing to a file"); + write_async(data, offset, size).wait(); +} + +std::future cufile_output_impl::write_async(void const* data, size_t offset, size_t size) +{ + int device; + cudaGetDevice(&device); + + auto write_slice = [device, gds_write = shim->write, file_handle = cf_file.handle()]( + void const* src, size_t size, size_t offset) -> void { + cudaSetDevice(device); + auto write_size = gds_write(file_handle, src, size, offset, 0); + CUDF_EXPECTS(write_size != -1 and write_size == static_cast(size), + "cuFile error writing to a file"); + }; + + auto source = static_cast(data); + auto slice_tasks = make_sliced_tasks(write_slice, source, offset, size, pool); + + auto waiter = [](auto slice_tasks) -> void { + for (auto const& task : slice_tasks) { + task.wait(); + } + }; + // The future returned from this function is deferred, not async because we want to avoid creating + // threads for each write_async call. This overhead is significant in case of multiple small + // writes. + return std::async(std::launch::deferred, waiter, std::move(slice_tasks)); } #endif diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index fdf8d012b0e..0119f9b7abd 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -142,6 +142,20 @@ class cufile_output : public cufile_io_base { * @param size Number of bytes to write */ virtual void write(void const* data, size_t offset, size_t size) = 0; + + /** + * @brief Asynchronously writes the data from a device buffer into a file. + * + * It is the caller's responsibility to not invalidate `data` until the result from this function + * is synchronized. 
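make_sliced_tasks splits a cuFile transfer into fixed-size slices, submits each slice to the internal thread pool, and hands the caller a single deferred future that waits on all of them. A simplified, self-contained sketch of that slice-and-defer idea, with hypothetical names; std::async stands in for the thread pool and the actual cuFile call is elided:

#include <algorithm>
#include <cstddef>
#include <future>
#include <utility>
#include <vector>

std::future<void> sliced_write_async_sketch(char const* data, std::size_t size)
{
  constexpr std::size_t max_slice_bytes = 4 * 1024 * 1024;
  std::vector<std::future<void>> slice_tasks;
  for (std::size_t offset = 0; offset < size; offset += max_slice_bytes) {
    auto const slice_size = std::min(max_slice_bytes, size - offset);
    // Each slice becomes an independent task; the real code submits these to
    // the thread pool so the GDS calls can run concurrently.
    slice_tasks.push_back(std::async(std::launch::async, [data, offset, slice_size] {
      // A real implementation would write [data + offset, data + offset + slice_size) here.
      (void)data;
      (void)offset;
      (void)slice_size;
    }));
  }
  // Deferred waiter: no extra thread is spawned just to block on the slices;
  // that cost is paid only when the caller synchronizes the returned future.
  return std::async(
    std::launch::deferred,
    [](std::vector<std::future<void>> tasks) {
      for (auto& t : tasks) { t.wait(); }
    },
    std::move(slice_tasks));
}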
+ * + * @throws cudf::logic_error on cuFile error + * + * @param data Pointer to the buffer to be written into the output file + * @param offset Number of bytes from the start + * @param size Number of bytes to write + */ + virtual std::future write_async(void const* data, size_t offset, size_t size) = 0; }; #ifdef CUFILE_FOUND @@ -242,10 +256,12 @@ class cufile_output_impl final : public cufile_output { cufile_output_impl(std::string const& filepath); void write(void const* data, size_t offset, size_t size) override; + std::future write_async(void const* data, size_t offset, size_t size) override; private: cufile_shim const* shim = nullptr; cufile_registered_file const cf_file; + cudf::detail::thread_pool pool; }; #else @@ -262,6 +278,14 @@ class cufile_input_impl final : public cufile_input { { CUDF_FAIL("Only used to compile without cufile library, should not be called"); } + + std::future read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } }; class cufile_output_impl final : public cufile_output { @@ -270,6 +294,10 @@ class cufile_output_impl final : public cufile_output { { CUDF_FAIL("Only used to compile without cufile library, should not be called"); } + std::future write_async(void const* data, size_t offset, size_t size) override + { + CUDF_FAIL("Only used to compile without cufile library, should not be called"); + } }; #endif diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu index 7f0890549b2..2db87736848 100644 --- a/cpp/src/io/utilities/parsing_utils.cu +++ b/cpp/src/io/utilities/parsing_utils.cu @@ -98,8 +98,8 @@ __global__ void count_and_set_positions(const char* data, } // namespace template -cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +cudf::size_type find_all_from_set(device_span data, + std::vector const& keys, uint64_t result_offset, T* positions, rmm::cuda_stream_view stream) @@ -108,31 +108,25 @@ cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, int min_grid_size = 0; // minimum block count required CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions)); - const int grid_size = divCeil(d_data.size(), (size_t)block_size); + const int grid_size = divCeil(data.size(), (size_t)block_size); auto d_count = cudf::detail::make_zeroed_device_uvector_async(1, stream); for (char key : keys) { - count_and_set_positions - <<>>(static_cast(d_data.data()), - d_data.size(), - result_offset, - key, - d_count.data(), - positions); + count_and_set_positions<<>>( + data.data(), data.size(), result_offset, key, d_count.data(), positions); } return cudf::detail::make_std_vector_sync(d_count, stream)[0]; } template -cudf::size_type find_all_from_set(const char* h_data, - size_t h_size, +cudf::size_type find_all_from_set(host_span data, const std::vector& keys, uint64_t result_offset, T* positions, rmm::cuda_stream_view stream) { - rmm::device_buffer d_chunk(std::min(max_chunk_bytes, h_size), stream); + rmm::device_buffer d_chunk(std::min(max_chunk_bytes, data.size()), stream); auto d_count = cudf::detail::make_zeroed_device_uvector_async(1, stream); int block_size = 0; // suggested thread count to use @@ -140,13 +134,13 @@ cudf::size_type find_all_from_set(const char* h_data, CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions)); - const size_t chunk_count = 
divCeil(h_size, max_chunk_bytes); + const size_t chunk_count = divCeil(data.size(), max_chunk_bytes); for (size_t ci = 0; ci < chunk_count; ++ci) { const auto chunk_offset = ci * max_chunk_bytes; - const auto h_chunk = h_data + chunk_offset; - const int chunk_bytes = std::min((size_t)(h_size - ci * max_chunk_bytes), max_chunk_bytes); - const auto chunk_bits = divCeil(chunk_bytes, bytes_per_find_thread); - const int grid_size = divCeil(chunk_bits, block_size); + const auto h_chunk = data.data() + chunk_offset; + const int chunk_bytes = std::min((size_t)(data.size() - ci * max_chunk_bytes), max_chunk_bytes); + const auto chunk_bits = divCeil(chunk_bytes, bytes_per_find_thread); + const int grid_size = divCeil(chunk_bits, block_size); // Copy chunk to device CUDA_TRY( @@ -166,45 +160,42 @@ cudf::size_type find_all_from_set(const char* h_data, return cudf::detail::make_std_vector_sync(d_count, stream)[0]; } -template cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +template cudf::size_type find_all_from_set(device_span data, + std::vector const& keys, uint64_t result_offset, uint64_t* positions, rmm::cuda_stream_view stream); -template cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +template cudf::size_type find_all_from_set(device_span data, + std::vector const& keys, uint64_t result_offset, pos_key_pair* positions, rmm::cuda_stream_view stream); -template cudf::size_type find_all_from_set(const char* h_data, - size_t h_size, - const std::vector& keys, +template cudf::size_type find_all_from_set(host_span data, + std::vector const& keys, uint64_t result_offset, uint64_t* positions, rmm::cuda_stream_view stream); -template cudf::size_type find_all_from_set(const char* h_data, - size_t h_size, - const std::vector& keys, +template cudf::size_type find_all_from_set(host_span data, + std::vector const& keys, uint64_t result_offset, pos_key_pair* positions, rmm::cuda_stream_view stream); -cudf::size_type count_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +cudf::size_type count_all_from_set(device_span data, + std::vector const& keys, rmm::cuda_stream_view stream) { - return find_all_from_set(d_data, keys, 0, nullptr, stream); + return find_all_from_set(data, keys, 0, nullptr, stream); } -cudf::size_type count_all_from_set(const char* h_data, - size_t h_size, +cudf::size_type count_all_from_set(host_span data, const std::vector& keys, rmm::cuda_stream_view stream) { - return find_all_from_set(h_data, h_size, keys, 0, nullptr, stream); + return find_all_from_set(data, keys, 0, nullptr, stream); } } // namespace io diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 19533c9fbdd..6da3296055c 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -68,7 +68,7 @@ struct parse_options { cudf::detail::optional_trie trie_na; bool multi_delimiter; - parse_options_view view() + parse_options_view view() const { return {delimiter, terminator, @@ -391,8 +391,8 @@ __device__ __inline__ cudf::size_type* infer_integral_field_counter(char const* * @return cudf::size_type total number of occurrences */ template -cudf::size_type find_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +cudf::size_type find_all_from_set(device_span data, + std::vector const& keys, uint64_t result_offset, T* positions, rmm::cuda_stream_view stream); @@ -415,8 +415,7 @@ cudf::size_type 
find_all_from_set(const rmm::device_buffer& d_data, * @return cudf::size_type total number of occurrences */ template -cudf::size_type find_all_from_set(const char* h_data, - size_t h_size, +cudf::size_type find_all_from_set(host_span data, const std::vector& keys, uint64_t result_offset, T* positions, @@ -432,8 +431,8 @@ cudf::size_type find_all_from_set(const char* h_data, * * @return cudf::size_type total number of occurrences */ -cudf::size_type count_all_from_set(const rmm::device_buffer& d_data, - const std::vector& keys, +cudf::size_type count_all_from_set(device_span data, + std::vector const& keys, rmm::cuda_stream_view stream); /** @@ -450,8 +449,7 @@ cudf::size_type count_all_from_set(const rmm::device_buffer& d_data, * * @return cudf::size_type total number of occurrences */ -cudf::size_type count_all_from_set(const char* h_data, - size_t h_size, +cudf::size_type count_all_from_set(host_span data, const std::vector& keys, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/utilities/thread_pool.hpp b/cpp/src/io/utilities/thread_pool.hpp index 33690c53758..c57082034db 100644 --- a/cpp/src/io/utilities/thread_pool.hpp +++ b/cpp/src/io/utilities/thread_pool.hpp @@ -207,35 +207,6 @@ class thread_pool { running = true; } - /** - * @brief Submit a function with zero or more arguments and no return value into the task queue, - * and get an std::future that will be set to true upon completion of the task. - * - * @tparam F The type of the function. - * @tparam A The types of the zero or more arguments to pass to the function. - * @param task The function to submit. - * @param args The zero or more arguments to pass to the function. - * @return A future to be used later to check if the function has finished its execution. - */ - template , std::decay_t...>>>> - std::future submit(const F& task, const A&... args) - { - std::shared_ptr> promise(new std::promise); - std::future future = promise->get_future(); - push_task([task, args..., promise] { - try { - task(args...); - promise->set_value(true); - } catch (...) { - promise->set_exception(std::current_exception()); - }; - }); - return future; - } - /** * @brief Submit a function with zero or more arguments and a return value into the task queue, * and get a future for its eventual returned value. @@ -250,15 +221,19 @@ class thread_pool { */ template , std::decay_t...>, - typename = std::enable_if_t>> + typename R = std::invoke_result_t, std::decay_t...>> std::future submit(const F& task, const A&... args) { std::shared_ptr> promise(new std::promise); std::future future = promise->get_future(); push_task([task, args..., promise] { try { - promise->set_value(task(args...)); + if constexpr (std::is_void_v) { + task(args...); + promise->set_value(); + } else { + promise->set_value(task(args...)); + } } catch (...) { promise->set_exception(std::current_exception()); }; diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index f4fe670d3cd..3992361ab1c 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -47,7 +47,9 @@ conditional_join(table_view const& left, // We can immediately filter out cases where the right table is empty. In // some cases, we return all the rows of the left table with a corresponding // null index for the right table; in others, we return an empty output. 
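With the bool-returning overload removed above, a single thread_pool::submit template now covers both void and non-void tasks by branching on the result type with if constexpr. A condensed sketch of that dispatch, with illustrative names; a detached std::thread stands in for the pool's task queue:

#include <future>
#include <memory>
#include <thread>
#include <type_traits>

template <typename F,
          typename... A,
          typename R = std::invoke_result_t<std::decay_t<F>, std::decay_t<A>...>>
std::future<R> submit_sketch(F const& task, A const&... args)
{
  auto promise = std::make_shared<std::promise<R>>();
  std::future<R> future = promise->get_future();
  // The real thread_pool pushes this callable onto its task queue; a detached
  // thread keeps the sketch self-contained.
  std::thread([task, args..., promise] {
    try {
      if constexpr (std::is_void_v<R>) {
        task(args...);
        promise->set_value();  // void tasks only signal completion
      } else {
        promise->set_value(task(args...));  // non-void tasks forward their result
      }
    } catch (...) {
      promise->set_exception(std::current_exception());
    }
  }).detach();
  return future;
}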
- if (right.num_rows() == 0) { + auto right_num_rows{right.num_rows()}; + auto left_num_rows{left.num_rows()}; + if (right_num_rows == 0) { switch (join_type) { // Left, left anti, and full all return all the row indices from left // with a corresponding NULL from the right. @@ -61,7 +63,7 @@ conditional_join(table_view const& left, std::make_unique>(0, stream, mr)); default: CUDF_FAIL("Invalid join kind."); break; } - } else if (left.num_rows() == 0) { + } else if (left_num_rows == 0) { switch (join_type) { // Left, left anti, left semi, and inner joins all return empty sets. case join_kind::LEFT_JOIN: @@ -92,8 +94,10 @@ conditional_join(table_view const& left, auto left_table = table_device_view::create(left, stream); auto right_table = table_device_view::create(right, stream); - // Allocate storage for the counter used to get the size of the join output - detail::grid_1d config(left_table->num_rows(), DEFAULT_JOIN_BLOCK_SIZE); + // For inner joins we support optimizing the join by launching one thread for + // whichever table is larger rather than always using the left table. + auto swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows); + detail::grid_1d config(swap_tables ? right_num_rows : left_num_rows, DEFAULT_JOIN_BLOCK_SIZE); auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; join_kind kernel_join_type = join_type == join_kind::FULL_JOIN ? join_kind::LEFT_JOIN : join_type; @@ -102,16 +106,27 @@ conditional_join(table_view const& left, if (output_size.has_value()) { join_size = *output_size; } else { + // Allocate storage for the counter used to get the size of the join output rmm::device_scalar size(0, stream, mr); CHECK_CUDA(stream.value()); if (has_nulls) { compute_conditional_join_output_size <<>>( - *left_table, *right_table, kernel_join_type, parser.device_expression_data, size.data()); + *left_table, + *right_table, + kernel_join_type, + parser.device_expression_data, + swap_tables, + size.data()); } else { compute_conditional_join_output_size <<>>( - *left_table, *right_table, kernel_join_type, parser.device_expression_data, size.data()); + *left_table, + *right_table, + kernel_join_type, + parser.device_expression_data, + swap_tables, + size.data()); } CHECK_CUDA(stream.value()); join_size = size.value(stream); @@ -145,7 +160,8 @@ conditional_join(table_view const& left, join_output_r, write_index.data(), parser.device_expression_data, - join_size); + join_size, + swap_tables); } else { conditional_join <<>>( @@ -156,7 +172,8 @@ conditional_join(table_view const& left, join_output_r, write_index.data(), parser.device_expression_data, - join_size); + join_size, + swap_tables); } CHECK_CUDA(stream.value()); @@ -167,7 +184,7 @@ conditional_join(table_view const& left, // by any row in the left table. if (join_type == join_kind::FULL_JOIN) { auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, left.num_rows(), right.num_rows(), stream, mr); + join_indices.second, left_num_rows, right_num_rows, stream, mr); join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); } return join_indices; @@ -183,19 +200,21 @@ std::size_t compute_conditional_join_output_size(table_view const& left, // We can immediately filter out cases where one table is empty. In // some cases, we return all the rows of the other table with a corresponding // null index for the empty table; in others, we return an empty output. 
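For inner joins the kernels can now be launched over whichever table has more rows; swap_tables records that choice and the per-thread loop maps the (outer, inner) pair back to (left, right) row indices. A host-side sketch of that index mapping, illustrative only, since the real loops run one CUDA thread per outer row:

// Illustrative alias; the kernels use cudf::size_type and the AST evaluator.
using size_type = int;

void inner_join_index_mapping_sketch(size_type left_num_rows, size_type right_num_rows)
{
  bool const swap_tables    = right_num_rows > left_num_rows;  // inner join only
  auto const outer_num_rows = swap_tables ? right_num_rows : left_num_rows;
  auto const inner_num_rows = swap_tables ? left_num_rows : right_num_rows;

  for (size_type outer = 0; outer < outer_num_rows; ++outer) {    // one thread per outer row
    for (size_type inner = 0; inner < inner_num_rows; ++inner) {  // loop over the smaller table
      auto const left_row_index  = swap_tables ? inner : outer;
      auto const right_row_index = swap_tables ? outer : inner;
      // Evaluate the join predicate on (left_row_index, right_row_index) and,
      // on a match, record the pair or bump the output count.
      (void)left_row_index;
      (void)right_row_index;
    }
  }
}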
- if (right.num_rows() == 0) { + auto right_num_rows{right.num_rows()}; + auto left_num_rows{left.num_rows()}; + if (right_num_rows == 0) { switch (join_type) { // Left, left anti, and full all return all the row indices from left // with a corresponding NULL from the right. case join_kind::LEFT_JOIN: case join_kind::LEFT_ANTI_JOIN: - case join_kind::FULL_JOIN: return left.num_rows(); + case join_kind::FULL_JOIN: return left_num_rows; // Inner and left semi joins return empty output because no matches can exist. case join_kind::INNER_JOIN: case join_kind::LEFT_SEMI_JOIN: return 0; default: CUDF_FAIL("Invalid join kind."); break; } - } else if (left.num_rows() == 0) { + } else if (left_num_rows == 0) { switch (join_type) { // Left, left anti, left semi, and inner joins all return empty sets. case join_kind::LEFT_JOIN: @@ -203,7 +222,7 @@ std::size_t compute_conditional_join_output_size(table_view const& left, case join_kind::INNER_JOIN: case join_kind::LEFT_SEMI_JOIN: return 0; // Full joins need to return the trivial complement. - case join_kind::FULL_JOIN: return right.num_rows(); + case join_kind::FULL_JOIN: return right_num_rows; default: CUDF_FAIL("Invalid join kind."); break; } } @@ -224,23 +243,38 @@ std::size_t compute_conditional_join_output_size(table_view const& left, auto left_table = table_device_view::create(left, stream); auto right_table = table_device_view::create(right, stream); + // For inner joins we support optimizing the join by launching one thread for + // whichever table is larger rather than always using the left table. + auto swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows); + detail::grid_1d config(swap_tables ? right_num_rows : left_num_rows, DEFAULT_JOIN_BLOCK_SIZE); + auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; + + assert(join_type != join_kind::FULL_JOIN); + // Allocate storage for the counter used to get the size of the join output rmm::device_scalar size(0, stream, mr); CHECK_CUDA(stream.value()); - detail::grid_1d config(left_table->num_rows(), DEFAULT_JOIN_BLOCK_SIZE); - auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block; // Determine number of output rows without actually building the output to simply // find what the size of the output will be. - assert(join_type != join_kind::FULL_JOIN); if (has_nulls) { compute_conditional_join_output_size <<>>( - *left_table, *right_table, join_type, parser.device_expression_data, size.data()); + *left_table, + *right_table, + join_type, + parser.device_expression_data, + swap_tables, + size.data()); } else { compute_conditional_join_output_size <<>>( - *left_table, *right_table, join_type, parser.device_expression_data, size.data()); + *left_table, + *right_table, + join_type, + parser.device_expression_data, + swap_tables, + size.data()); } CHECK_CUDA(stream.value()); diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index 6dc204441b3..4aceb79a9aa 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -43,6 +43,8 @@ namespace detail { * @param[in] join_type The type of join to be performed * @param[in] device_expression_data Container of device data required to evaluate the desired * expression. + * @param[in] swap_tables If true, the kernel was launched with one thread per right row and + * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. 
* @param[out] output_size The resulting output size */ template @@ -51,6 +53,7 @@ __global__ void compute_conditional_join_output_size( table_device_view right_table, join_kind join_type, ast::detail::expression_device_view device_expression_data, + bool const swap_tables, std::size_t* output_size) { // The (required) extern storage of the shared memory array leads to @@ -64,19 +67,23 @@ __global__ void compute_conditional_join_output_size( &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates]; std::size_t thread_counter{0}; - cudf::size_type const left_start_idx = threadIdx.x + blockIdx.x * blockDim.x; - cudf::size_type const left_stride = blockDim.x * gridDim.x; + cudf::size_type const start_idx = threadIdx.x + blockIdx.x * blockDim.x; + cudf::size_type const stride = blockDim.x * gridDim.x; cudf::size_type const left_num_rows = left_table.num_rows(); cudf::size_type const right_num_rows = right_table.num_rows(); + auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); + auto const inner_num_rows = (swap_tables ? left_num_rows : right_num_rows); auto evaluator = cudf::ast::detail::expression_evaluator( left_table, right_table, device_expression_data); - for (cudf::size_type left_row_index = left_start_idx; left_row_index < left_num_rows; - left_row_index += left_stride) { + for (cudf::size_type outer_row_index = start_idx; outer_row_index < outer_num_rows; + outer_row_index += stride) { bool found_match = false; - for (cudf::size_type right_row_index = 0; right_row_index < right_num_rows; right_row_index++) { - auto output_dest = cudf::ast::detail::value_expression_result(); + for (cudf::size_type inner_row_index = 0; inner_row_index < inner_num_rows; inner_row_index++) { + auto output_dest = cudf::ast::detail::value_expression_result(); + auto const left_row_index = swap_tables ? inner_row_index : outer_row_index; + auto const right_row_index = swap_tables ? outer_row_index : inner_row_index; evaluator.evaluate( output_dest, left_row_index, right_row_index, 0, thread_intermediate_storage); if (output_dest.is_valid() && output_dest.value()) { @@ -122,6 +129,8 @@ __global__ void compute_conditional_join_output_size( * @param device_expression_data Container of device data required to evaluate the desired * expression. * @param[in] max_size The maximum size of the output + * @param[in] swap_tables If true, the kernel was launched with one thread per right row and + * the kernel needs to internally loop over left rows. Otherwise, loop over right rows. */ template __global__ void conditional_join(table_device_view left_table, @@ -131,7 +140,8 @@ __global__ void conditional_join(table_device_view left_table, cudf::size_type* join_output_r, cudf::size_type* current_idx, cudf::ast::detail::expression_device_view device_expression_data, - cudf::size_type const max_size) + cudf::size_type const max_size, + bool const swap_tables) { constexpr int num_warps = block_size / detail::warp_size; __shared__ cudf::size_type current_idx_shared[num_warps]; @@ -152,22 +162,26 @@ __global__ void conditional_join(table_device_view left_table, int const lane_id = threadIdx.x % detail::warp_size; cudf::size_type const left_num_rows = left_table.num_rows(); cudf::size_type const right_num_rows = right_table.num_rows(); + auto const outer_num_rows = (swap_tables ? right_num_rows : left_num_rows); + auto const inner_num_rows = (swap_tables ? 
left_num_rows : right_num_rows); if (0 == lane_id) { current_idx_shared[warp_id] = 0; } __syncwarp(); - cudf::size_type left_row_index = threadIdx.x + blockIdx.x * blockDim.x; + cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * blockDim.x; - unsigned int const activemask = __ballot_sync(0xffffffff, left_row_index < left_num_rows); + unsigned int const activemask = __ballot_sync(0xffffffff, outer_row_index < outer_num_rows); auto evaluator = cudf::ast::detail::expression_evaluator( left_table, right_table, device_expression_data); - if (left_row_index < left_num_rows) { + if (outer_row_index < outer_num_rows) { bool found_match = false; - for (size_type right_row_index(0); right_row_index < right_num_rows; ++right_row_index) { - auto output_dest = cudf::ast::detail::value_expression_result(); + for (size_type inner_row_index(0); inner_row_index < inner_num_rows; ++inner_row_index) { + auto output_dest = cudf::ast::detail::value_expression_result(); + auto const left_row_index = swap_tables ? inner_row_index : outer_row_index; + auto const right_row_index = swap_tables ? outer_row_index : inner_row_index; evaluator.evaluate( output_dest, left_row_index, right_row_index, 0, thread_intermediate_storage); @@ -215,7 +229,12 @@ __global__ void conditional_join(table_device_view left_table, if ((join_type == join_kind::LEFT_JOIN || join_type == join_kind::LEFT_ANTI_JOIN || join_type == join_kind::FULL_JOIN) && (!found_match)) { - add_pair_to_cache(left_row_index, + // TODO: This code assumes that swap_tables is false for all join + // kinds aside from inner joins. Once the code is generalized to handle + // other joins we'll want to switch the variable in the line below back + // to the left_row_index, but for now we can assume that they are + // equivalent inside this conditional. + add_pair_to_cache(outer_row_index, static_cast(JoinNoneValue), current_idx_shared, warp_id, diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 07ad2e052f1..b6fe18434eb 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,13 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include #include #include -#include #include #include #include +#include #include #include @@ -33,6 +34,26 @@ namespace cudf { namespace detail { +namespace { + +/** + * @brief Device functor to determine if a row is valid. + */ +class row_is_valid { + public: + row_is_valid(bitmask_type const* row_bitmask) : _row_bitmask{row_bitmask} {} + + __device__ __inline__ bool operator()(const size_type& i) const noexcept + { + return cudf::bit_is_set(_row_bitmask, i); + } + + private: + bitmask_type const* _row_bitmask; +}; + +} // anonymous namespace + std::pair, std::unique_ptr
> get_empty_joined_table( table_view const& probe, table_view const& build) { @@ -44,51 +65,39 @@ std::pair, std::unique_ptr
> get_empty_joined_table /** * @brief Builds the hash table based on the given `build_table`. * - * @throw cudf::logic_error if the number of columns in `build` table is 0. - * @throw cudf::logic_error if the number of rows in `build` table is 0. - * @throw cudf::logic_error if insertion to the hash table fails. - * * @param build Table of columns used to build join hash. + * @param hash_table Build hash table. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches. * - * @return Built hash table. */ -std::unique_ptr> build_join_hash_table( - cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) +void build_join_hash_table(cudf::table_view const& build, + multimap_type& hash_table, + null_equality compare_nulls, + rmm::cuda_stream_view stream) { - auto build_device_table = cudf::table_device_view::create(build, stream); - - CUDF_EXPECTS(0 != build_device_table->num_columns(), "Selected build dataset is empty"); - CUDF_EXPECTS(0 != build_device_table->num_rows(), "Build side table has no rows"); - - size_type const build_table_num_rows{build_device_table->num_rows()}; - std::size_t const hash_table_size = compute_hash_table_size(build_table_num_rows); - - auto hash_table = multimap_type::create(hash_table_size, - stream, - true, - multimap_type::hasher(), - multimap_type::key_equal(), - multimap_type::allocator_type()); - - row_hash hash_build{*build_device_table}; - rmm::device_scalar failure(0, stream); - constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - detail::grid_1d config(build_table_num_rows, block_size); - auto const row_bitmask = (compare_nulls == null_equality::EQUAL) - ? rmm::device_buffer{0, stream} - : cudf::detail::bitmask_and(build, stream); - build_hash_table<<>>( - *hash_table, - hash_build, - build_table_num_rows, - static_cast(row_bitmask.data()), - failure.data()); - // Check error code from the kernel - if (failure.value(stream) == 1) { CUDF_FAIL("Hash Table insert failure."); } - - return hash_table; + auto build_table_ptr = cudf::table_device_view::create(build, stream); + + CUDF_EXPECTS(0 != build_table_ptr->num_columns(), "Selected build dataset is empty"); + CUDF_EXPECTS(0 != build_table_ptr->num_rows(), "Build side table has no rows"); + + row_hash hash_build{*build_table_ptr}; + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_build, empty_key_sentinel}; + + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); + + size_type const build_table_num_rows{build_table_ptr->num_rows()}; + if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) { + hash_table.insert(iter, iter + build_table_num_rows, stream.value()); + } else { + thrust::counting_iterator stencil(0); + auto const row_bitmask = cudf::detail::bitmask_and(build, stream); + row_is_valid pred{static_cast(row_bitmask.data())}; + + // insert valid rows + hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.value()); + } } /** @@ -123,8 +132,11 @@ probe_join_hash_table(cudf::table_device_view build_table, constexpr cudf::detail::join_kind ProbeJoinKind = (JoinKind == cudf::detail::join_kind::FULL_JOIN) ? cudf::detail::join_kind::LEFT_JOIN : JoinKind; - std::size_t const join_size = output_size.value_or(compute_join_output_size( - build_table, probe_table, hash_table, compare_nulls, stream)); + + std::size_t const join_size = output_size + ? 
*output_size + : compute_join_output_size( + build_table, probe_table, hash_table, compare_nulls, stream); // If output size is zero, return immediately if (join_size == 0) { @@ -132,46 +144,37 @@ probe_join_hash_table(cudf::table_device_view build_table, std::make_unique>(0, stream, mr)); } - rmm::device_scalar write_index(0, stream); - auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); - constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - detail::grid_1d config(probe_table.num_rows(), block_size); + pair_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; row_hash hash_probe{probe_table}; - row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; - if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { - probe_hash_table - <<>>( - hash_table, - build_table, - probe_table, - hash_probe, - equality, - left_indices->data(), - right_indices->data(), - write_index.data(), - join_size); - auto const actual_size = write_index.value(stream); - left_indices->resize(actual_size, stream); - right_indices->resize(actual_size, stream); + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_probe, empty_key_sentinel}; + + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); + + const cudf::size_type probe_table_num_rows = probe_table.num_rows(); + + auto out1_zip_begin = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_discard_iterator(), left_indices->begin())); + auto out2_zip_begin = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_discard_iterator(), right_indices->begin())); + + if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN or + JoinKind == cudf::detail::join_kind::LEFT_JOIN) { + [[maybe_unused]] auto [out1_zip_end, out2_zip_end] = hash_table.pair_retrieve_outer( + iter, iter + probe_table_num_rows, out1_zip_begin, out2_zip_begin, equality, stream.value()); + + if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { + auto const actual_size = out1_zip_end - out1_zip_begin; + left_indices->resize(actual_size, stream); + right_indices->resize(actual_size, stream); + } } else { - probe_hash_table - <<>>( - hash_table, - build_table, - probe_table, - hash_probe, - equality, - left_indices->data(), - right_indices->data(), - write_index.data(), - join_size); + hash_table.pair_retrieve( + iter, iter + probe_table_num_rows, out1_zip_begin, out2_zip_begin, equality, stream.value()); } return std::make_pair(std::move(left_indices), std::move(right_indices)); } @@ -209,24 +212,24 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); - constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - detail::grid_1d config(probe_table.num_rows(), block_size); + pair_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; row_hash hash_probe{probe_table}; - row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; - probe_hash_table - <<>>(hash_table, - build_table, - probe_table, - hash_probe, - equality, - left_indices->data(), - right_indices->data(), - write_index.data(), - join_size); + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_probe, empty_key_sentinel}; + + auto iter = 
cudf::detail::make_counting_transform_iterator(0, pair_func); + + const cudf::size_type probe_table_num_rows = probe_table.num_rows(); + + auto out1_zip_begin = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_discard_iterator(), left_indices->begin())); + auto out2_zip_begin = thrust::make_zip_iterator( + thrust::make_tuple(thrust::make_discard_iterator(), right_indices->begin())); + + hash_table.pair_retrieve_outer( + iter, iter + probe_table_num_rows, out1_zip_begin, out2_zip_begin, equality, stream.value()); + // Release intermediate memory allocation left_indices->resize(0, stream); @@ -286,22 +289,25 @@ hash_join::hash_join_impl::~hash_join_impl() = default; hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, null_equality compare_nulls, rmm::cuda_stream_view stream) - : _hash_table(nullptr) + : _is_empty{build.num_rows() == 0}, + _hash_table{compute_hash_table_size(build.num_rows()), + std::numeric_limits::max(), + cudf::detail::JoinNoneValue, + stream.value()} { CUDF_FUNC_RANGE(); CUDF_EXPECTS(0 != build.num_columns(), "Hash join build table is empty"); CUDF_EXPECTS(build.num_rows() < cudf::detail::MAX_JOIN_SIZE, "Build column size is too big for hash join"); - auto flattened_build = structs::detail::flatten_nested_columns( - build, {}, {}, structs::detail::column_nullability::FORCE); - _build = std::get<0>(flattened_build); // need to store off the owning structures for some of the views in _build - _created_null_columns = std::move(std::get<3>(flattened_build)); + _flattened_build_table = structs::detail::flatten_nested_columns( + build, {}, {}, structs::detail::column_nullability::FORCE); + _build = _flattened_build_table; - if (0 == build.num_rows()) { return; } + if (_is_empty) { return; } - _hash_table = build_join_hash_table(_build, compare_nulls, stream); + build_join_hash_table(_build, _hash_table, compare_nulls, stream); } std::pair>, @@ -350,17 +356,17 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& p CUDF_FUNC_RANGE(); // Return directly if build table is empty - if (_hash_table == nullptr) { return 0; } + if (_is_empty) { return 0; } auto flattened_probe = structs::detail::flatten_nested_columns( probe, {}, {}, structs::detail::column_nullability::FORCE); - auto const flattened_probe_table = std::get<0>(flattened_probe); + auto const flattened_probe_table = flattened_probe.flattened_columns(); auto build_table_ptr = cudf::table_device_view::create(_build, stream); auto flattened_probe_table_ptr = cudf::table_device_view::create(flattened_probe_table, stream); return cudf::detail::compute_join_output_size( - *build_table_ptr, *flattened_probe_table_ptr, *_hash_table, compare_nulls, stream); + *build_table_ptr, *flattened_probe_table_ptr, _hash_table, compare_nulls, stream); } std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& probe, @@ -370,17 +376,17 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& pr CUDF_FUNC_RANGE(); // Trivial left join case - exit early - if (_hash_table == nullptr) { return probe.num_rows(); } + if (_is_empty) { return probe.num_rows(); } auto flattened_probe = structs::detail::flatten_nested_columns( probe, {}, {}, structs::detail::column_nullability::FORCE); - auto const flattened_probe_table = std::get<0>(flattened_probe); + auto const flattened_probe_table = flattened_probe.flattened_columns(); auto build_table_ptr = cudf::table_device_view::create(_build, stream); auto flattened_probe_table_ptr = 
cudf::table_device_view::create(flattened_probe_table, stream); return cudf::detail::compute_join_output_size( - *build_table_ptr, *flattened_probe_table_ptr, *_hash_table, compare_nulls, stream); + *build_table_ptr, *flattened_probe_table_ptr, _hash_table, compare_nulls, stream); } std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& probe, @@ -391,17 +397,17 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& pr CUDF_FUNC_RANGE(); // Trivial left join case - exit early - if (_hash_table == nullptr) { return probe.num_rows(); } + if (_is_empty) { return probe.num_rows(); } auto flattened_probe = structs::detail::flatten_nested_columns( probe, {}, {}, structs::detail::column_nullability::FORCE); - auto const flattened_probe_table = std::get<0>(flattened_probe); + auto const flattened_probe_table = flattened_probe.flattened_columns(); auto build_table_ptr = cudf::table_device_view::create(_build, stream); auto flattened_probe_table_ptr = cudf::table_device_view::create(flattened_probe_table, stream); return get_full_join_size( - *build_table_ptr, *flattened_probe_table_ptr, *_hash_table, compare_nulls, stream, mr); + *build_table_ptr, *flattened_probe_table_ptr, _hash_table, compare_nulls, stream, mr); } template @@ -419,7 +425,7 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, auto flattened_probe = structs::detail::flatten_nested_columns( probe, {}, {}, structs::detail::column_nullability::FORCE); - auto const flattened_probe_table = std::get<0>(flattened_probe); + auto const flattened_probe_table = flattened_probe.flattened_columns(); CUDF_EXPECTS(_build.num_columns() == flattened_probe_table.num_columns(), "Mismatch in number of columns to be joined on"); @@ -450,19 +456,19 @@ hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe, rmm::mr::device_memory_resource* mr) const { // Trivial left join case - exit early - if (_hash_table == nullptr and JoinKind != cudf::detail::join_kind::INNER_JOIN) { + if (_is_empty and JoinKind != cudf::detail::join_kind::INNER_JOIN) { return get_trivial_left_join_indices(probe, stream, mr); } - CUDF_EXPECTS(_hash_table, "Hash table of hash join is null."); + CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null."); - auto build_table = cudf::table_device_view::create(_build, stream); - auto probe_table = cudf::table_device_view::create(probe, stream); + auto build_table_ptr = cudf::table_device_view::create(_build, stream); + auto probe_table_ptr = cudf::table_device_view::create(probe, stream); auto join_indices = cudf::detail::probe_join_hash_table( - *build_table, *probe_table, *_hash_table, compare_nulls, output_size, stream, mr); + *build_table_ptr, *probe_table_ptr, _hash_table, compare_nulls, output_size, stream, mr); - if (JoinKind == cudf::detail::join_kind::FULL_JOIN) { + if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { auto complement_indices = detail::get_left_join_indices_complement( join_indices.second, probe.num_rows(), _build.num_rows(), stream, mr); join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh index 8951d8b3aca..10b0e420ef6 100644 --- a/cpp/src/join/hash_join.cuh +++ b/cpp/src/join/hash_join.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,11 +15,12 @@ */ #pragma once -#include #include #include -#include +#include +#include +#include #include #include #include @@ -37,12 +38,47 @@ namespace cudf { namespace detail { + +/** + * @brief Remaps a hash value to a new value if it is equal to the specified sentinel value. + * + * @param hash The hash value to potentially remap + * @param sentinel The reserved value + */ +template +constexpr auto remap_sentinel_hash(H hash, S sentinel) +{ + // Arbitrarily choose hash - 1 + return (hash == sentinel) ? (hash - 1) : hash; +} + +/** + * @brief Device functor to create a pair of hash value and index for a given row. + */ +class make_pair_function { + public: + make_pair_function(row_hash const& hash, hash_value_type const empty_key_sentinel) + : _hash{hash}, _empty_key_sentinel{empty_key_sentinel} + { + } + + __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept + { + // Compute the hash value of row `i` + auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); + return cuco::make_pair(std::move(row_hash_value), std::move(i)); + } + + private: + row_hash _hash; + hash_value_type const _empty_key_sentinel; +}; + /** * @brief Calculates the exact size of the join output produced when * joining two tables together. * * @throw cudf::logic_error if JoinKind is not INNER_JOIN or LEFT_JOIN - * @throw cudf::logic_error if the exact size overflows cudf::size_type * * @tparam JoinKind The type of join to be performed * @tparam multimap_type The type of the hash table @@ -81,41 +117,22 @@ std::size_t compute_join_output_size(table_device_view build_table, } } - // Allocate storage for the counter used to get the size of the join output - std::size_t h_size{0}; - rmm::device_scalar d_size(0, stream); - - CHECK_CUDA(stream.value()); - - constexpr int block_size{DEFAULT_JOIN_BLOCK_SIZE}; - int numBlocks{-1}; + pair_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; - CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( - &numBlocks, compute_join_output_size, block_size, 0)); + row_hash hash_probe{probe_table}; + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_probe, empty_key_sentinel}; - int dev_id{-1}; - CUDA_TRY(cudaGetDevice(&dev_id)); + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); - int num_sms{-1}; - CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); + std::size_t size; + if constexpr (JoinKind == join_kind::LEFT_JOIN) { + size = hash_table.pair_count_outer(iter, iter + probe_table_num_rows, equality, stream.value()); + } else { + size = hash_table.pair_count(iter, iter + probe_table_num_rows, equality, stream.value()); + } - row_hash hash_probe{probe_table}; - row_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL}; - // Probe the hash table without actually building the output to simply - // find what the size of the output will be. - compute_join_output_size - <<>>(hash_table, - build_table, - probe_table, - hash_probe, - equality, - probe_table_num_rows, - d_size.data()); - - CHECK_CUDA(stream.value()); - h_size = d_size.value(stream); - - return h_size; + return size; } std::pair, std::unique_ptr
> get_empty_joined_table( @@ -136,10 +153,11 @@ struct hash_join::hash_join_impl { hash_join_impl& operator=(hash_join_impl&&) = delete; private: + bool _is_empty; cudf::table_view _build; std::vector> _created_null_columns; - std::unique_ptr> - _hash_table; + cudf::structs::detail::flattened_table _flattened_build_table; + cudf::detail::multimap_type _hash_table; public: /** diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index d5c23b1d612..cec633765c7 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -27,6 +27,26 @@ namespace cudf { namespace detail { +/** + * @brief Device functor to determine if two pairs are identical. + */ +class pair_equality { + public: + pair_equality(table_device_view lhs, table_device_view rhs, bool nulls_are_equal = true) + : _check_row_equality{lhs, rhs, nulls_are_equal} + { + } + + __device__ __forceinline__ bool operator()(const pair_type& lhs, + const pair_type& rhs) const noexcept + { + return lhs.first == rhs.first and _check_row_equality(rhs.second, lhs.second); + } + + private: + row_equality _check_row_equality; +}; + /** * @brief Computes the trivial left join operation for the case when the * right table is empty. diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index d2541b006a7..d6eb5e93a98 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -21,25 +21,29 @@ #include +#include + #include namespace cudf { namespace detail { constexpr size_type MAX_JOIN_SIZE{std::numeric_limits::max()}; +constexpr int DEFAULT_JOIN_CG_SIZE = 2; constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128; constexpr int DEFAULT_JOIN_CACHE_SIZE = 128; constexpr size_type JoinNoneValue = std::numeric_limits::min(); +using pair_type = cuco::pair_type; + +using hash_type = cuco::detail::MurmurHash3_32; + using multimap_type = - concurrent_unordered_multimap::max(), - std::numeric_limits::max(), - default_hash, - equal_to, - default_allocator>>; + cuco::static_multimap, + cuco::double_hashing>; using row_hash = cudf::row_hasher; diff --git a/cpp/src/join/join_kernels.cuh b/cpp/src/join/join_kernels.cuh deleted file mode 100644 index 62bc7d0fe80..00000000000 --- a/cpp/src/join/join_kernels.cuh +++ /dev/null @@ -1,330 +0,0 @@ -/* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include - -#include -#include - -namespace cudf { -namespace detail { -/** - * @brief Remaps a hash value to a new value if it is equal to the specified sentinel value. - * - * @param hash The hash value to potentially remap - * @param sentinel The reserved value - */ -template -constexpr auto remap_sentinel_hash(H hash, S sentinel) -{ - // Arbitrarily choose hash - 1 - return (hash == sentinel) ? (hash - 1) : hash; -} - -/** - * @brief Builds a hash table from a row hasher that maps the hash - * values of each row to its respective row index. 
- * - * @tparam multimap_type The type of the hash table - * - * @param[in,out] multi_map The hash table to be built to insert rows into - * @param[in] hash_build Row hasher for the build table - * @param[in] build_table_num_rows The number of rows in the build table - * @param[in] row_bitmask Bitmask where bit `i` indicates the presence of a null - * value in row `i` of input keys. This is nullptr if nulls are equal. - * @param[out] error Pointer used to set an error code if the insert fails - */ -template -__global__ void build_hash_table(multimap_type multi_map, - row_hash hash_build, - const cudf::size_type build_table_num_rows, - bitmask_type const* row_bitmask, - int* error) -{ - cudf::size_type i = threadIdx.x + blockIdx.x * blockDim.x; - - while (i < build_table_num_rows) { - if (!row_bitmask || cudf::bit_is_set(row_bitmask, i)) { - // Compute the hash value of this row - auto const row_hash_value = remap_sentinel_hash(hash_build(i), multi_map.get_unused_key()); - - // Insert the (row hash value, row index) into the map - // using the row hash value to determine the location in the - // hash map where the new pair should be inserted - auto const insert_location = - multi_map.insert(thrust::make_pair(row_hash_value, i), true, row_hash_value); - - // If the insert failed, set the error code accordingly - if (multi_map.end() == insert_location) { *error = 1; } - } - i += blockDim.x * gridDim.x; - } -} - -/** - * @brief Computes the output size of joining the probe table to the build table - * by probing the hash map with the probe table and counting the number of matches. - * - * @tparam JoinKind The type of join to be performed - * @tparam multimap_type The datatype of the hash table - * @tparam block_size The number of threads per block for this kernel - * - * @param[in] multi_map The hash table built on the build table - * @param[in] build_table The build table - * @param[in] probe_table The probe table - * @param[in] hash_probe Row hasher for the probe table - * @param[in] check_row_equality The row equality comparator - * @param[in] probe_table_num_rows The number of rows in the probe table - * @param[out] output_size The resulting output size - */ -template -__global__ void compute_join_output_size(multimap_type multi_map, - table_device_view build_table, - table_device_view probe_table, - row_hash hash_probe, - row_equality check_row_equality, - const cudf::size_type probe_table_num_rows, - std::size_t* output_size) -{ - // This kernel probes multiple elements in the probe_table and store the number of matches found - // inside a register. A block reduction is used at the end to calculate the matches per thread - // block, and atomically add to the global 'output_size'. Compared to probing one element per - // thread, this implementation improves performance by reducing atomic adds to the shared memory - // counter. 
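For contrast with the counting kernel deleted here, the new path in this patch hands the counting to the cuco multimap. The sketch below is condensed from the new compute_join_output_size in hash_join.cuh (hash_table, probe_table, build_table, compare_nulls, and probe_table_num_rows are assumed to be in scope as they are there); it is illustrative, not a drop-in replacement:

    // Turn each probe row index into a (hash, row index) pair on the fly.
    row_hash hash_probe{probe_table};
    make_pair_function pair_func{hash_probe, hash_table.get_empty_key_sentinel()};
    auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func);

    // Pairs match when their hashes match and the underlying rows compare equal.
    pair_equality equality{probe_table, build_table, compare_nulls == null_equality::EQUAL};

    // The multimap counts the matches; the *_outer variant also counts probe rows with
    // no match, which is what a left join needs.
    std::size_t size;
    if constexpr (JoinKind == join_kind::LEFT_JOIN) {
      size = hash_table.pair_count_outer(iter, iter + probe_table_num_rows, equality, stream.value());
    } else {
      size = hash_table.pair_count(iter, iter + probe_table_num_rows, equality, stream.value());
    }

The block reduction and atomicAdd bookkeeping in the kernel body that follows is exactly what this replaces.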
- - cudf::size_type thread_counter{0}; - const cudf::size_type start_idx = threadIdx.x + blockIdx.x * blockDim.x; - const cudf::size_type stride = blockDim.x * gridDim.x; - const auto unused_key = multi_map.get_unused_key(); - const auto end = multi_map.end(); - - for (cudf::size_type probe_row_index = start_idx; probe_row_index < probe_table_num_rows; - probe_row_index += stride) { - // Search the hash map for the hash value of the probe row using the row's - // hash value to determine the location where to search for the row in the hash map - auto const probe_row_hash_value = remap_sentinel_hash(hash_probe(probe_row_index), unused_key); - - auto found = multi_map.find(probe_row_hash_value, true, probe_row_hash_value); - - // for left-joins we always need to add an output - bool running = (JoinKind == join_kind::LEFT_JOIN) || (end != found); - bool found_match = false; - - while (running) { - // TODO Simplify this logic... - - // Left joins always have an entry in the output - if (JoinKind == join_kind::LEFT_JOIN && (end == found)) { - running = false; - } - // Stop searching after encountering an empty hash table entry - else if (unused_key == found->first) { - running = false; - } - // First check that the hash values of the two rows match - else if (found->first == probe_row_hash_value) { - // If the hash values are equal, check that the rows are equal - if (check_row_equality(probe_row_index, found->second)) { - // If the rows are equal, then we have found a true match - found_match = true; - ++thread_counter; - } - // Continue searching for matching rows until you hit an empty hash map entry - ++found; - // If you hit the end of the hash map, wrap around to the beginning - if (end == found) found = multi_map.begin(); - // Next entry is empty, stop searching - if (unused_key == found->first) running = false; - } else { - // Continue searching for matching rows until you hit an empty hash table entry - ++found; - // If you hit the end of the hash map, wrap around to the beginning - if (end == found) found = multi_map.begin(); - // Next entry is empty, stop searching - if (unused_key == found->first) running = false; - } - - if ((JoinKind == join_kind::LEFT_JOIN) && (!running) && (!found_match)) { ++thread_counter; } - } - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); - - // Add block counter to global counter - if (threadIdx.x == 0) atomicAdd(output_size, block_counter); -} - -/** - * @brief Probes the hash map with the probe table to find all matching rows - * between the probe and hash table and generate the output for the desired - * Join operation. 
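The probing kernel documented here is replaced in the same way, by the multimap's bulk pair_retrieve. Condensed from the new probe_join_hash_table earlier in this diff (iter, equality, left_indices, and right_indices as defined there); each output zip iterator sends the hash half of a retrieved pair to a discard_iterator and keeps only the row index:

    auto out1_zip_begin = thrust::make_zip_iterator(
      thrust::make_tuple(thrust::make_discard_iterator(), left_indices->begin()));
    auto out2_zip_begin = thrust::make_zip_iterator(
      thrust::make_tuple(thrust::make_discard_iterator(), right_indices->begin()));

    if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN or
                  JoinKind == cudf::detail::join_kind::LEFT_JOIN) {
      // The *_outer variant also emits pairs for unmatched probe rows; FULL_JOIN uses the
      // returned end iterators to shrink the output to its actual size.
      [[maybe_unused]] auto [out1_zip_end, out2_zip_end] = hash_table.pair_retrieve_outer(
        iter, iter + probe_table_num_rows, out1_zip_begin, out2_zip_begin, equality, stream.value());
    } else {
      hash_table.pair_retrieve(
        iter, iter + probe_table_num_rows, out1_zip_begin, out2_zip_begin, equality, stream.value());
    }

The warp-cooperative output caching and flushing logic in the kernel below is what this replaces.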
- * - * @tparam JoinKind The type of join to be performed - * @tparam multimap_type The type of the hash table - * @tparam block_size The number of threads per block for this kernel - * @tparam output_cache_size The side of the shared memory buffer to cache join output results - * - * @param[in] multi_map The hash table built from the build table - * @param[in] build_table The build table - * @param[in] probe_table The probe table - * @param[in] hash_probe Row hasher for the probe table - * @param[in] check_row_equality The row equality comparator - * @param[out] join_output_l The left result of the join operation - * @param[out] join_output_r The right result of the join operation - * @param[in,out] current_idx A global counter used by threads to coordinate writes to the global - output - * @param[in] max_size The maximum size of the output - */ -template -__global__ void probe_hash_table(multimap_type multi_map, - table_device_view build_table, - table_device_view probe_table, - row_hash hash_probe, - row_equality check_row_equality, - size_type* join_output_l, - size_type* join_output_r, - cudf::size_type* current_idx, - const std::size_t max_size) -{ - constexpr int num_warps = block_size / detail::warp_size; - __shared__ size_type current_idx_shared[num_warps]; - __shared__ size_type join_shared_l[num_warps][output_cache_size]; - __shared__ size_type join_shared_r[num_warps][output_cache_size]; - - const int warp_id = threadIdx.x / detail::warp_size; - const int lane_id = threadIdx.x % detail::warp_size; - const cudf::size_type probe_table_num_rows = probe_table.num_rows(); - - if (0 == lane_id) { current_idx_shared[warp_id] = 0; } - - __syncwarp(); - - size_type probe_row_index = threadIdx.x + blockIdx.x * blockDim.x; - - const unsigned int activemask = __ballot_sync(0xffffffff, probe_row_index < probe_table_num_rows); - if (probe_row_index < probe_table_num_rows) { - const auto unused_key = multi_map.get_unused_key(); - const auto end = multi_map.end(); - - // Search the hash map for the hash value of the probe row using the row's - // hash value to determine the location where to search for the row in the hash map - auto const probe_row_hash_value = remap_sentinel_hash(hash_probe(probe_row_index), unused_key); - - auto found = multi_map.find(probe_row_hash_value, true, probe_row_hash_value); - - bool running = (JoinKind == join_kind::LEFT_JOIN) || - (end != found); // for left-joins we always need to add an output - bool found_match = false; - while (__any_sync(activemask, running)) { - if (running) { - // TODO Simplify this logic... 
- - // Left joins always have an entry in the output - if ((JoinKind == join_kind::LEFT_JOIN) && (end == found)) { - running = false; - } - // Stop searching after encountering an empty hash table entry - else if (unused_key == found->first) { - running = false; - } - // First check that the hash values of the two rows match - else if (found->first == probe_row_hash_value) { - // If the hash values are equal, check that the rows are equal - // TODO : REMOVE : if(row_equal{probe_table, build_table}(probe_row_index, found->second)) - if (check_row_equality(probe_row_index, found->second)) { - // If the rows are equal, then we have found a true match - found_match = true; - add_pair_to_cache(probe_row_index, - found->second, - current_idx_shared, - warp_id, - join_shared_l[warp_id], - join_shared_r[warp_id]); - } - // Continue searching for matching rows until you hit an empty hash map entry - ++found; - // If you hit the end of the hash map, wrap around to the beginning - if (end == found) found = multi_map.begin(); - // Next entry is empty, stop searching - if (unused_key == found->first) running = false; - } else { - // Continue searching for matching rows until you hit an empty hash table entry - ++found; - // If you hit the end of the hash map, wrap around to the beginning - if (end == found) found = multi_map.begin(); - // Next entry is empty, stop searching - if (unused_key == found->first) running = false; - } - - // If performing a LEFT join and no match was found, insert a Null into the output - if ((JoinKind == join_kind::LEFT_JOIN) && (!running) && (!found_match)) { - add_pair_to_cache(probe_row_index, - static_cast(JoinNoneValue), - current_idx_shared, - warp_id, - join_shared_l[warp_id], - join_shared_r[warp_id]); - } - } - - __syncwarp(activemask); - // flush output cache if next iteration does not fit - if (current_idx_shared[warp_id] + detail::warp_size >= output_cache_size) { - flush_output_cache(activemask, - max_size, - warp_id, - lane_id, - current_idx, - current_idx_shared, - join_shared_l, - join_shared_r, - join_output_l, - join_output_r); - __syncwarp(activemask); - if (0 == lane_id) { current_idx_shared[warp_id] = 0; } - __syncwarp(activemask); - } - } - - // final flush of output cache - if (current_idx_shared[warp_id] > 0) { - flush_output_cache(activemask, - max_size, - warp_id, - lane_id, - current_idx, - current_idx_shared, - join_shared_l, - join_shared_r, - join_output_l, - join_output_r); - } - } -} - -} // namespace detail - -} // namespace cudf diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 69a7b8c722b..4a2f46d6f43 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -16,12 +16,12 @@ #include #include -#include #include #include #include #include +#include #include #include #include @@ -68,8 +68,8 @@ std::unique_ptr> left_semi_anti_join( auto left_flattened_tables = structs::detail::flatten_nested_columns( left_keys, {}, {}, structs::detail::column_nullability::FORCE); - auto right_flattened_keys = std::get<0>(right_flattened_tables); - auto left_flattened_keys = std::get<0>(left_flattened_tables); + auto right_flattened_keys = right_flattened_tables.flattened_columns(); + auto left_flattened_keys = left_flattened_tables.flattened_columns(); // Only care about existence, so we'll use an unordered map (other joins need a multimap) using hash_table_type = concurrent_unordered_map; diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu index 66b5bb98dbf..5007c3028ad 100644 --- 
a/cpp/src/labeling/label_bins.cu +++ b/cpp/src/labeling/label_bins.cu @@ -213,7 +213,7 @@ std::unique_ptr label_bins(column_view const& input, "The left and right edge columns cannot contain nulls."); // Handle empty inputs. - if (input.is_empty()) { return make_empty_column(data_type(type_to_id())); } + if (input.is_empty()) { return make_empty_column(type_to_id()); } return type_dispatcher(input.type(), detail::bin_type_dispatcher{}, diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index 3cd8a23d0fc..5916837f97a 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -224,7 +224,7 @@ struct list_child_constructor { auto const num_child_rows{ cudf::detail::get_value(list_offsets, list_offsets.size() - 1, stream)}; - if (num_child_rows == 0) { return make_empty_column(data_type{type_id::STRING}); } + if (num_child_rows == 0) { return make_empty_column(type_id::STRING); } auto string_views = rmm::device_uvector(num_child_rows, stream); diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index 77d41c5ddc9..8cbcddc1c58 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -45,6 +45,7 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, auto const gather_index_begin = gather_map.offsets_begin() + 1; auto const gather_index_end = gather_map.offsets_end(); auto const value_offsets = value_column.offsets_begin(); + auto const value_device_view = column_device_view::create(value_column.parent(), stream); auto const map_begin = cudf::detail::indexalator_factory::make_input_iterator(gather_map_sliced_child); auto const out_of_bounds = [] __device__(auto const index, auto const list_size) { @@ -52,7 +53,8 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, }; // Calculate Flattened gather indices (value_offset[row]+sub_index - auto transformer = [value_offsets, + auto transformer = [values_lists_view = *value_device_view, + value_offsets, map_begin, gather_index_begin, gather_index_end, @@ -64,8 +66,9 @@ std::unique_ptr segmented_gather(lists_column_view const& value_column, thrust::seq, gather_index_begin, gather_index_end, gather_index_begin[-1] + index) - gather_index_begin; // Get each sub_index in list in each row of gather_map. - auto sub_index = map_begin[index]; - auto list_size = value_offsets[offset_idx + 1] - value_offsets[offset_idx]; + auto sub_index = map_begin[index]; + auto list_is_null = values_lists_view.is_null(offset_idx); + auto list_size = list_is_null ? 0 : (value_offsets[offset_idx + 1] - value_offsets[offset_idx]); auto wrapped_sub_index = sub_index < 0 ? sub_index + list_size : sub_index; auto constexpr null_idx = cuda::std::numeric_limits::max(); // Add sub_index to value_column offsets, to get gather indices of child of value_column diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index c547ca14f2d..a4afa2d1055 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include #include @@ -22,7 +22,9 @@ #include #include #include -#include +#include +#include +#include #include #include #include @@ -34,28 +36,29 @@ #include #include -#include +#include +#include +#include #include +#include -namespace cudf { -namespace lists { +#include + +namespace cudf::lists { namespace detail { + namespace { template struct has_negative_nans_fn { - column_device_view const d_entries; - bool const has_nulls; + column_device_view const d_view; - has_negative_nans_fn(column_device_view const d_entries, bool const has_nulls) - : d_entries(d_entries), has_nulls(has_nulls) - { - } + has_negative_nans_fn(column_device_view const& d_view) : d_view(d_view) {} __device__ Type operator()(size_type idx) const noexcept { - if (has_nulls && d_entries.is_null_nocheck(idx)) { return false; } + if (d_view.is_null(idx)) { return false; } - auto const val = d_entries.element(idx); + auto const val = d_view.element(idx); return std::isnan(val) && std::signbit(val); // std::signbit(x) == true if x is negative } }; @@ -64,36 +67,32 @@ struct has_negative_nans_fn { * @brief A structure to be used along with type_dispatcher to check if a column has any * negative NaN value. * - * This functor is used to check for replacing negative NaN if there exists one. It is neccessary - * because when calling to `lists::detail::sort_lists`, the negative NaN and positive NaN values (if - * both exist) are separated to the two ends of the output column. This is due to the API - * `lists::detail::sort_lists` internally calls `cub::DeviceSegmentedRadixSort`, which performs - * sorting by comparing bits of the input numbers. Since negative and positive NaN have - * different bits representation, they may not be moved to be close to each other after sorted. + * This functor is neccessary because when calling to segmented sort on the list entries, the + * negative NaN and positive NaN values (if both exist) are separated to the two ends of the output + * lists. We want to move all NaN values close together in order to call unique_copy later on. */ struct has_negative_nans_dispatch { template >* = nullptr> - bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const noexcept + bool operator()(column_view const& input, rmm::cuda_stream_view stream) const noexcept { - auto const d_entries = column_device_view::create(lists_entries, stream); - return thrust::count_if( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(lists_entries.size()), - detail::has_negative_nans_fn{*d_entries, lists_entries.has_nulls()}); + auto const d_entries_ptr = column_device_view::create(input, stream); + return thrust::count_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + has_negative_nans_fn{*d_entries_ptr}); } template >* = nullptr> - bool operator()(column_view const& lists_entries, rmm::cuda_stream_view stream) const + bool operator()(column_view const& input, rmm::cuda_stream_view stream) const { // Recursively check negative NaN on the children columns. 
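The concern about negative NaNs may look obscure; here is a small standalone illustration (plain C++, not part of the patch) of the point made in the comment above: -NaN and NaN are both NaN, but they differ in the sign bit, so an ordering that compares bit patterns can push them to opposite ends of a sorted list.

    #include <cmath>
    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <limits>

    int main()
    {
      double const pos_nan = std::numeric_limits<double>::quiet_NaN();
      double const neg_nan = std::copysign(pos_nan, -1.0);

      std::uint64_t pos_bits, neg_bits;
      std::memcpy(&pos_bits, &pos_nan, sizeof(double));
      std::memcpy(&neg_bits, &neg_nan, sizeof(double));

      std::cout << std::isnan(pos_nan) << std::isnan(neg_nan) << '\n';      // 11: both are NaN
      std::cout << std::signbit(pos_nan) << std::signbit(neg_nan) << '\n';  // 01 on typical IEEE-754 platforms
      std::cout << (pos_bits == neg_bits) << '\n';                          // 0: different bit patterns
    }

Replacing -NaN with NaN before the segmented sort, as replace_negative_nans_dispatch does further down, keeps all NaNs adjacent so that the later unique_copy can collapse them.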
- return std::any_of( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(lists_entries.num_children()), - [structs_view = structs_column_view{lists_entries}, stream](auto const child_idx) { - auto const col = structs_view.get_sliced_child(child_idx); - return type_dispatcher(col.type(), detail::has_negative_nans_dispatch{}, col, stream); - }); + return std::any_of(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.num_children()), + [structs_view = structs_column_view{input}, stream](auto const child_idx) { + auto const col = structs_view.get_sliced_child(child_idx); + return type_dispatcher( + col.type(), has_negative_nans_dispatch{}, col, stream); + }); } template >* = nullptr> bool operator()(column_view const&, rmm::cuda_stream_view) const { - // Columns of non floating-point data will never contain NaN. + // Non-nested columns of non floating-point data do not contain NaN. + // Nested columns (not STRUCT) are not supported and should not reach this point. return false; } }; -template -struct replace_negative_nans_fn { - __device__ Type operator()(Type val) const noexcept - { - return std::isnan(val) ? std::numeric_limits::quiet_NaN() : val; - } -}; - /** - * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for all rows - * in a floating-point data column. + * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for a + * floating-point data column. + * + * Replacing -NaN by NaN is necessary before calling to segmented sort for lists because the sorting + * API may separate -NaN and NaN to the two ends of each result list while we want to group all NaN + * together. */ struct replace_negative_nans_dispatch { - template && - !std::is_same_v>* = nullptr> - std::unique_ptr operator()(column_view const& lists_entries, - rmm::cuda_stream_view) const noexcept - { - // For non floating point type and non struct, just return a copy of the input. - return std::make_unique(lists_entries); - } - - template >* = nullptr> - std::unique_ptr operator()(column_view const& lists_entries, + template >* = nullptr> + std::unique_ptr operator()(column_view const& input, rmm::cuda_stream_view stream) const noexcept { - auto new_entries = cudf::detail::allocate_like( - lists_entries, lists_entries.size(), cudf::mask_allocation_policy::NEVER, stream); - new_entries->set_null_mask(cudf::detail::copy_bitmask(lists_entries, stream), - lists_entries.null_count()); - - // Replace all negative NaN values. - thrust::transform(rmm::exec_policy(stream), - lists_entries.template begin(), - lists_entries.template end(), - new_entries->mutable_view().template begin(), - detail::replace_negative_nans_fn{}); - - return new_entries; + return cuda::std::is_floating_point_v + ? 
cudf::detail::normalize_nans_and_zeros(input, stream) + : std::make_unique(input, stream); } template >* = nullptr> - std::unique_ptr operator()(column_view const& lists_entries, + std::unique_ptr operator()(column_view const& input, rmm::cuda_stream_view stream) const noexcept { std::vector> output_struct_members; - std::transform( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(lists_entries.num_children()), - std::back_inserter(output_struct_members), - [structs_view = structs_column_view{lists_entries}, stream](auto const child_idx) { - auto const col = structs_view.get_sliced_child(child_idx); - return type_dispatcher(col.type(), detail::replace_negative_nans_dispatch{}, col, stream); - }); - - return cudf::make_structs_column(lists_entries.size(), + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.num_children()), + std::back_inserter(output_struct_members), + [structs_view = structs_column_view{input}, stream](auto const child_idx) { + auto const col = structs_view.get_sliced_child(child_idx); + return type_dispatcher( + col.type(), replace_negative_nans_dispatch{}, col, stream); + }); + + return cudf::make_structs_column(input.size(), std::move(output_struct_members), - lists_entries.null_count(), - cudf::detail::copy_bitmask(lists_entries, stream), + input.null_count(), + cudf::detail::copy_bitmask(input, stream), stream); } }; /** - * @brief Generate a 0-based offset column for a lists column. + * @brief Populate 1-based list indices for all list entries. * - * Given a lists_column_view, which may have a non-zero offset, generate a new column containing - * 0-based list offsets. This is done by subtracting each of the input list offset by the first - * offset. + * Given a number of total list entries in a lists column and an array containing list offsets, + * generate an array that maps each list entry to a 1-based index of the list containing + * that entry. * - * @code{.pseudo} - * Given a list column having offsets = { 3, 7, 9, 13 }, - * then output_offsets = { 0, 4, 6, 10 } - * @endcode - * - * @param lists_column The input lists column. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device resource used to allocate memory. - * @return A column containing 0-based list offsets. - */ -std::unique_ptr generate_clean_offsets(lists_column_view const& lists_column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto output_offsets = make_numeric_column(data_type{type_to_id()}, - lists_column.size() + 1, - mask_state::UNALLOCATED, - stream, - mr); - thrust::transform( - rmm::exec_policy(stream), - lists_column.offsets_begin(), - lists_column.offsets_end(), - output_offsets->mutable_view().begin(), - [first = lists_column.offsets_begin()] __device__(auto offset) { return offset - *first; }); - return output_offsets; -} - -/** - * @brief Transform a given lists column to a new lists column in which all the list entries holding - * -NaN value are replaced by (positive) NaN. - * - * Replacing -NaN by NaN is necessary before sorting (individual) lists because the sorting API is - * using radix sort, which compares bits of the number thus it may separate -NaN by NaN to the two - * ends of the result column. - */ -std::unique_ptr replace_negative_nans_entries(column_view const& lists_entries, - lists_column_view const& lists_column, - rmm::cuda_stream_view stream) -{ - // We need to copy the offsets column of the input lists_column. 
Since the input lists_column may - // be sliced, we need to generate clean offsets (i.e., offsets starting from zero). - auto new_offsets = - generate_clean_offsets(lists_column, stream, rmm::mr::get_current_device_resource()); - auto new_entries = type_dispatcher( - lists_entries.type(), detail::replace_negative_nans_dispatch{}, lists_entries, stream); - - return make_lists_column( - lists_column.size(), - std::move(new_offsets), - std::move(new_entries), - lists_column.null_count(), - cudf::detail::copy_bitmask( - lists_column.parent(), stream, rmm::mr::get_current_device_resource())); -} - -/** - * @brief Populate list offsets for all list entries. - * - * Given an `offsets` column_view containing offsets of a lists column and a number of all list - * entries in the column, generate an array that maps from each list entry to the offset of the list - * containing that entry. + * Instead of regular 0-based indices, we need to use 1-based indices for later post-processing. * * @code{.pseudo} - * num_entries = 10, offsets = { 0, 4, 6, 10 } + * num_lists = 3, num_entries = 10, offsets = { 0, 4, 6, 10 } * output = { 1, 1, 1, 1, 2, 2, 3, 3, 3, 3 } * @endcode * - * @param num_entries The number of list entries. - * @param offsets Column view to the list offsets. + * @param num_lists The size of the input lists column. + * @param num_entries The number of entries in the lists column. + * @param offsets_begin The pointer refers to data of list offsets. * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device resource used to allocate memory. - * @return A column containing entry list offsets. + * @return An array containing 1-based list indices corresponding to each list entry. */ -std::unique_ptr generate_entry_list_offsets(size_type num_entries, - column_view const& offsets, - rmm::cuda_stream_view stream) +rmm::device_uvector generate_entry_list_indices(size_type num_lists, + size_type num_entries, + offset_type const* offsets_begin, + rmm::cuda_stream_view stream) { - auto entry_list_offsets = make_numeric_column(offsets.type(), - num_entries, - mask_state::UNALLOCATED, - stream, - rmm::mr::get_current_device_resource()); + auto entry_list_indices = rmm::device_uvector(num_entries, stream); + + auto const input = thrust::make_transform_iterator( + offsets_begin, [offsets_begin] __device__(auto const idx) { return idx - *offsets_begin; }); thrust::upper_bound(rmm::exec_policy(stream), - offsets.begin(), - offsets.end(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - entry_list_offsets->mutable_view().begin()); - return entry_list_offsets; + input, + input + num_lists, + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + entry_list_indices.begin()); + return entry_list_indices; } /** - * @brief Performs an equality comparison between two entries in a lists column. + * @brief Perform an equality comparison between two entries in a lists column, specialized from + * `cudf::element_equality_comparator` to take into account both parameters `nulls_equal` and + * `nans_equal` when comparing floating-point numbers. * - * For the two elements that are NOT in the same list in the lists column, they will always be - * considered as different. If they are from the same list and their type is not floating point, - * this functor will return the same comparison result as `cudf::element_equality_comparator`. 
+ * For the two entries that are NOT in the same list, they will always be considered as different. * - * For floating-point types, entries holding NaN value can be considered as different values or the - * same value depending on the `nans_equal` parameter. + * If they are from the same list and their type is not floating point, this functor will return the + * same comparison result as `cudf::element_equality_comparator`. * - * @tparam Type The data type of entries - * @tparam nans_equal Flag to specify whether NaN entries should be considered as equal value (only - * applicable for floating-point data column) + * For floating-point types, entries holding NaN value can be considered as different or the same + * value depending on the `nans_equal` parameter. */ template struct column_row_comparator_fn { - offset_type const* const list_offsets; + size_type const* const list_indices; column_device_view const lhs; column_device_view const rhs; null_equality const nulls_equal; bool const has_nulls; bool const nans_equal; - __host__ __device__ column_row_comparator_fn(offset_type const* const list_offsets, + __host__ __device__ column_row_comparator_fn(size_type const* const list_indices, column_device_view const& lhs, column_device_view const& rhs, null_equality const nulls_equal, bool const has_nulls, bool const nans_equal) - : list_offsets(list_offsets), + : list_indices(list_indices), lhs(lhs), rhs(rhs), nulls_equal(nulls_equal), @@ -307,20 +221,20 @@ struct column_row_comparator_fn { { } - template >* = nullptr> + template >* = nullptr> bool __device__ compare(T const& lhs_val, T const& rhs_val) const noexcept { return lhs_val == rhs_val; } - template >* = nullptr> + template >* = nullptr> bool __device__ compare(T const& lhs_val, T const& rhs_val) const noexcept { - // If both element(i) and element(j) are NaNs and nans are considered as equal value then this + // If both element(i) and element(j) are NaNs and NaNs are considered as equal value then this // comparison will return `true`. This is the desired behavior in Pandas. if (nans_equal && std::isnan(lhs_val) && std::isnan(rhs_val)) { return true; } - // If nans are considered as NOT equal, even both element(i) and element(j) are NaNs this + // If NaNs are considered as NOT equal, even both element(i) and element(j) are NaNs this // comparison will still return `false`. This is the desired behavior in Apache Spark. return lhs_val == rhs_val; } @@ -328,7 +242,7 @@ struct column_row_comparator_fn { bool __device__ operator()(size_type i, size_type j) const noexcept { // Two entries are not considered for equality if they belong to different lists. - if (list_offsets[i] != list_offsets[j]) { return false; } + if (list_indices[i] != list_indices[j]) { return false; } if (has_nulls) { bool const lhs_is_null{lhs.nullable() && lhs.is_null_nocheck(i)}; @@ -340,7 +254,7 @@ struct column_row_comparator_fn { } } - return compare(lhs.element(i), lhs.element(j)); + return compare(lhs.element(i), lhs.element(j)); } }; @@ -348,20 +262,20 @@ struct column_row_comparator_fn { * @brief Struct used in type_dispatcher for comparing two entries in a lists column. 
*/ struct column_row_comparator_dispatch { - offset_type const* const list_offsets; + size_type const* const list_indices; column_device_view const lhs; column_device_view const rhs; null_equality const nulls_equal; bool const has_nulls; bool const nans_equal; - __device__ column_row_comparator_dispatch(offset_type const* const list_offsets, + __device__ column_row_comparator_dispatch(size_type const* const list_indices, column_device_view const& lhs, column_device_view const& rhs, null_equality const nulls_equal, bool const has_nulls, bool const nans_equal) - : list_offsets(list_offsets), + : list_indices(list_indices), lhs(lhs), rhs(rhs), nulls_equal(nulls_equal), @@ -374,7 +288,7 @@ struct column_row_comparator_dispatch { bool __device__ operator()(size_type i, size_type j) const noexcept { return column_row_comparator_fn{ - list_offsets, lhs, rhs, nulls_equal, has_nulls, nans_equal}(i, j); + list_indices, lhs, rhs, nulls_equal, has_nulls, nans_equal}(i, j); } template ()>* = nullptr> @@ -386,24 +300,24 @@ struct column_row_comparator_dispatch { }; /** - * @brief Performs an equality comparison between rows of two tables using `column_row_comparator` - * to compare rows of their corresponding columns. + * @brief Performs an equality comparison between rows of two tables using + * `column_row_comparator_fn` functor to compare rows of their corresponding columns. */ struct table_row_comparator_fn { - offset_type const* const list_offsets; + size_type const* const list_indices; table_device_view const lhs; table_device_view const rhs; null_equality const nulls_equal; bool const has_nulls; bool const nans_equal; - table_row_comparator_fn(offset_type const* const list_offsets, + table_row_comparator_fn(size_type const* const list_indices, table_device_view const& lhs, table_device_view const& rhs, null_equality const nulls_equal, bool const has_nulls, bool const nans_equal) - : list_offsets(list_offsets), + : list_indices(list_indices), lhs(lhs), rhs(rhs), nulls_equal(nulls_equal), @@ -412,12 +326,12 @@ struct table_row_comparator_fn { { } - bool __device__ operator()(size_type i, size_type j) const noexcept + bool __device__ operator()(size_type i, size_type j) const { auto column_comp = [=](column_device_view const& lhs, column_device_view const& rhs) { return type_dispatcher( lhs.type(), - column_row_comparator_dispatch{list_offsets, lhs, rhs, nulls_equal, has_nulls, nans_equal}, + column_row_comparator_dispatch{list_indices, lhs, rhs, nulls_equal, has_nulls, nans_equal}, i, j); }; @@ -427,133 +341,129 @@ struct table_row_comparator_fn { }; /** - * @brief Struct used in type_dispatcher for copying indices of the list entries ignoring - * duplicates. + * @brief Struct used in type_dispatcher for copying indices of the list entries ignoring duplicate + * list entries. 
*/ -struct get_unique_entries_dispatch { +struct get_indices_of_unique_entries_dispatch { template () && !std::is_same_v>* = nullptr> - offset_type* operator()(offset_type const*, - column_view const&, - size_type, - offset_type*, - null_equality, - nan_equality, - bool, - rmm::cuda_stream_view) const + size_type* operator()(size_type const*, + column_view const&, + size_type, + size_type*, + null_equality, + nan_equality, + bool, + duplicate_keep_option, + rmm::cuda_stream_view) const { CUDF_FAIL( - "`get_unique_entries_dispatch` cannot operate on types that are not equally comparable."); + "get_indices_of_unique_entries_dispatch cannot operate on types that are not equally " + "comparable or not STRUCT type."); } template ()>* = nullptr> - offset_type* operator()(offset_type const* list_offsets, - column_view const& all_lists_entries, - size_type num_entries, - offset_type* output_begin, - null_equality nulls_equal, - nan_equality nans_equal, - bool has_nulls, - rmm::cuda_stream_view stream) const noexcept + size_type* operator()(size_type const* list_indices, + column_view const& all_lists_entries, + size_type num_entries, + size_type* output_begin, + null_equality nulls_equal, + nan_equality nans_equal, + bool has_nulls, + duplicate_keep_option keep_option, + rmm::cuda_stream_view stream) const noexcept { auto const d_view = column_device_view::create(all_lists_entries, stream); - auto const comp = column_row_comparator_fn{list_offsets, + auto const comp = column_row_comparator_fn{list_indices, *d_view, *d_view, nulls_equal, has_nulls, nans_equal == nan_equality::ALL_EQUAL}; - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); + return cudf::detail::unique_copy(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp, + keep_option, + stream); } template >* = nullptr> - offset_type* operator()(offset_type const* list_offsets, - column_view const& all_lists_entries, - size_type num_entries, - offset_type* output_begin, - null_equality nulls_equal, - nan_equality nans_equal, - bool has_nulls, - rmm::cuda_stream_view stream) const noexcept + size_type* operator()(size_type const* list_indices, + column_view const& all_lists_entries, + size_type num_entries, + size_type* output_begin, + null_equality nulls_equal, + nan_equality nans_equal, + bool has_nulls, + duplicate_keep_option keep_option, + rmm::cuda_stream_view stream) const noexcept { - auto const entries_tview = table_view{{all_lists_entries}}; - auto const flatten_nullability = has_nested_nulls(entries_tview) - ? 
structs::detail::column_nullability::FORCE - : structs::detail::column_nullability::MATCH_INCOMING; - auto const entries_flattened = cudf::structs::detail::flatten_nested_columns( - entries_tview, {order::ASCENDING}, {null_order::AFTER}, flatten_nullability); - auto const d_view = table_device_view::create(std::get<0>(entries_flattened), stream); - - auto const comp = table_row_comparator_fn{list_offsets, - *d_view, - *d_view, + auto const flattened_entries = cudf::structs::detail::flatten_nested_columns( + table_view{{all_lists_entries}}, {order::ASCENDING}, {null_order::AFTER}, {}); + auto const dview_ptr = table_device_view::create(flattened_entries, stream); + + auto const comp = table_row_comparator_fn{list_indices, + *dview_ptr, + *dview_ptr, nulls_equal, has_nulls, nans_equal == nan_equality::ALL_EQUAL}; - - return thrust::unique_copy(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp); + return cudf::detail::unique_copy(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_entries), + output_begin, + comp, + keep_option, + stream); } }; /** - * @brief Copy list entries and entry list offsets ignoring duplicates. - * - * Given an array of all entries flattened from a list column and an array that maps each entry to - * the offset of the list containing that entry, those entries and list offsets are copied into - * new arrays such that the duplicated entries within each list will be ignored. - * - * @param all_lists_entries The input array containing all list entries. - * @param entries_list_offsets A map from list entries to their corresponding list offsets. - * @param nulls_equal Flag to specify whether null entries should be considered equal. - * @param nans_equal Flag to specify whether NaN entries should be considered equal - * (only applicable for floating-point data column). - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device resource used to allocate memory. - * @return A pair of columns, the first one contains unique list entries and the second one - * contains their corresponding list offsets. + * @brief Extract list entries and their corresponding (1-based) list indices ignoring duplicate + * entries. */ -std::vector> get_unique_entries_and_list_offsets( - column_view const& all_lists_entries, - column_view const& entries_list_offsets, +std::vector> get_unique_entries_and_list_indices( + column_view const& keys_entries, + std::optional const& values_entries, + device_span entries_list_indices, null_equality nulls_equal, nan_equality nans_equal, + duplicate_keep_option keep_option, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const num_entries = all_lists_entries.size(); + auto const num_entries = keys_entries.size(); - // Allocate memory to store the indices of the unique entries. - auto unique_indices = rmm::device_uvector(num_entries, stream); + // Allocate memory to store the indices of the unique key entries. + // These indices will be used as a gather map to collect keys and values. 
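Pairing every entry with its 1-based list index is what confines duplicate detection to a single list: two equal entries in different lists never compare as duplicates. A host-side analogue (plain C++, illustrative only, roughly the KEEP_FIRST behaviour) of the comparator used just below:

    #include <algorithm>
    #include <iterator>
    #include <numeric>
    #include <vector>

    int main()
    {
      // A keys column [[1, 1, 2], [], [2, 2]] flattened into entries plus 1-based list indices.
      std::vector<int> entries{1, 1, 2, 2, 2};
      std::vector<int> indices{1, 1, 1, 3, 3};

      std::vector<int> positions(entries.size());
      std::iota(positions.begin(), positions.end(), 0);

      // Two adjacent positions are duplicates only if they belong to the same list
      // AND hold equal entries.
      std::vector<int> unique_pos;
      std::unique_copy(
        positions.begin(), positions.end(), std::back_inserter(unique_pos),
        [&](int i, int j) { return indices[i] == indices[j] && entries[i] == entries[j]; });

      // unique_pos == {0, 2, 3}: the repeated 1 inside the first list is dropped, while the
      // two 2s living in different lists are both kept. These positions then act as the
      // gather map allocated below.
    }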
+ auto unique_indices = rmm::device_uvector(num_entries, stream); auto const output_begin = unique_indices.begin(); - auto const output_end = type_dispatcher(all_lists_entries.type(), - get_unique_entries_dispatch{}, - entries_list_offsets.begin(), - all_lists_entries, + auto const output_end = type_dispatcher(keys_entries.type(), + get_indices_of_unique_entries_dispatch{}, + entries_list_indices.begin(), + keys_entries, num_entries, output_begin, nulls_equal, nans_equal, - all_lists_entries.has_nulls(), + keys_entries.has_nulls(), + keep_option, stream); - auto gather_map = column_view(data_type{type_to_id()}, - static_cast(thrust::distance(output_begin, output_end)), - unique_indices.data()); - - // Collect unique entries and entry list offsets. - // The new null_count and bitmask of the unique entries will also be generated - // by the gather function. - return cudf::detail::gather(table_view{{all_lists_entries, entries_list_offsets}}, - gather_map, + auto const list_indices_view = column_view(data_type{type_to_id()}, + static_cast(entries_list_indices.size()), + entries_list_indices.data()); + auto const input_table = values_entries + ? table_view{{keys_entries, values_entries.value(), list_indices_view}} + : table_view{{keys_entries, list_indices_view}}; + + // Collect unique entries and entry list indices. + // The new null_count and bitmask of the unique entries will also be generated by the gather + // function. + return cudf::detail::gather(input_table, + device_span( + unique_indices.data(), thrust::distance(output_begin, output_end)), cudf::out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, stream, @@ -562,151 +472,258 @@ std::vector> get_unique_entries_and_list_offsets( } /** - * @brief Generate list offsets from entry offsets. + * @brief Generate list offsets from entry list indices for the final result lists column(s). * - * Generate an array of list offsets for the final result lists column. The list offsets of the - * original lists column are also taken into account to make sure the result lists column will have - * the same empty list rows (if any) as in the original lists column. - * - * @param num_entries The number of unique entries after removing duplicates. - * @param entries_list_offsets The mapping from list entries to their list offsets. - * @param original_offsets The list offsets of the original lists column, which will also be used to - * store the new list offsets. + * @param num_lists The number of lists. + * @param num_entries The number of extracted unique list entries. + * @param entries_list_indices The mapping from list entries to their (1-based) list indices. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device resource used to allocate memory. */ -void generate_offsets(size_type num_entries, - column_view const& entries_list_offsets, - mutable_column_view const& original_offsets, - rmm::cuda_stream_view stream) +std::unique_ptr generate_output_offsets(size_type num_lists, + size_type num_entries, + column_view const& entries_list_indices, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - // Firstly, generate temporary list offsets for the unique entries, ignoring empty lists (if any). - // If entries_list_offsets = {1, 1, 1, 1, 2, 3, 3, 3, 4, 4 }, num_entries = 10, - // then new_offsets = { 0, 4, 5, 8, 10 }. 
- auto const new_offsets = allocate_like( - original_offsets, mask_allocation_policy::NEVER, rmm::mr::get_current_device_resource()); - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries + 1), - new_offsets->mutable_view().begin(), - [num_entries, offsets_ptr = entries_list_offsets.begin()] __device__( - auto i) -> bool { - return i == 0 || i == num_entries || offsets_ptr[i] != offsets_ptr[i - 1]; - }); - - // Generate a prefix sum of number of empty lists, storing inplace to the original lists - // offsets. - // If the original list offsets is { 0, 0, 5, 5, 6, 6 } (there are 2 empty lists), - // and new_offsets = { 0, 4, 6 }, then output = { 0, 1, 1, 2, 2, 3}. - auto const iter_trans_begin = cudf::detail::make_counting_transform_iterator( - 0, [offsets = original_offsets.begin()] __device__(auto i) { - return (i > 0 && offsets[i] == offsets[i - 1]) ? 1 : 0; - }); - thrust::inclusive_scan(rmm::exec_policy(stream), - iter_trans_begin, - iter_trans_begin + original_offsets.size(), - original_offsets.begin()); - - // Generate the final list offsets. - // If the original list offsets are { 0, 0, 5, 5, 6, 6 }, the new offsets are { 0, 4, 6 }, - // and the prefix sums of empty lists are { 0, 1, 1, 2, 2, 3 }, - // then output = { 0, 0, 4, 4, 5, 5 }. - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(original_offsets.size()), - original_offsets.begin(), - [prefix_sum_empty_lists = original_offsets.begin(), - offsets = new_offsets->view().begin()] __device__(auto i) { - return offsets[i - prefix_sum_empty_lists[i]]; - }); + // Consider an example: + // Suppose the original offsets of the input lists column are [0, 4, 5, 6, 7, 10, 11, 13]. + // The original entries_list_indices is [1, 1, 1, 1, 2, 3, 4, 5, 5, 5, 6, 7, 7]; after + // extracting unique entries, entries_list_indices becomes [1, 1, 1, 4, 5, 5, 5, 7, 7], + // with num_lists = 7 and num_entries = 9. These are the inputs to this function. + // + // Extracting unique entries removed one entry from the list with index 1 (the first + // list, as we are using 1-based list indices), and removed all entries of the lists with + // indices {2, 3, 6}. + + // This variable stores the (1-based) list indices of the unique entries, but only one index value + // per non-empty list. Given the example above, this array will hold the values + // [1, 4, 5, 7]. + auto list_indices = rmm::device_uvector(num_lists, stream); + + // Stores the non-zero numbers of unique entries per list. + // Given the example above, this array will contain the values [3, 1, 3, 2]. + auto list_sizes = rmm::device_uvector(num_lists, stream); + + // Count the number of unique entries for each non-empty list. + auto const end = thrust::reduce_by_key(rmm::exec_policy(stream), + entries_list_indices.template begin(), + entries_list_indices.template end(), + thrust::make_constant_iterator(1), + list_indices.begin(), + list_sizes.begin()); + auto const num_non_empty_lists = thrust::distance(list_indices.begin(), end.first); + + // The output offsets for the output lists column(s). + auto new_offsets = rmm::device_uvector(num_lists + 1, stream, mr); + + // The new offsets need to be filled with zeros first. + thrust::uninitialized_fill_n( + rmm::exec_policy(stream), new_offsets.begin(), num_lists + 1, offset_type{0}); + + // Scatter non-zero sizes of the output lists into the correct positions.
+ // Given the example above, we will have new_offsets = [0, 3, 0, 0, 1, 3, 0, 2] + thrust::scatter(rmm::exec_policy(stream), + list_sizes.begin(), + list_sizes.begin() + num_non_empty_lists, + list_indices.begin(), + new_offsets.begin()); + + // Generate offsets from sizes. + // Given the example above, we will have new_offsets = [0, 3, 3, 3, 4, 7, 7, 9] + thrust::inclusive_scan( + rmm::exec_policy(stream), new_offsets.begin(), new_offsets.end(), new_offsets.begin()); + + // Done. Hope that your head didn't explode after reading till this point. + return std::make_unique( + data_type{type_to_id()}, num_lists + 1, new_offsets.release()); } -} // anonymous namespace - /** - * @copydoc cudf::lists::drop_list_duplicates - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @brief Common execution code called by all public `drop_list_duplicates` APIs. */ -std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::pair, std::unique_ptr> drop_list_duplicates_common( + lists_column_view const& keys, + std::optional const& values, + null_equality nulls_equal, + nan_equality nans_equal, + duplicate_keep_option keep_option, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - if (lists_column.is_empty()) return cudf::empty_like(lists_column.parent()); - if (auto const child_type = lists_column.child().type(); + if (auto const child_type = keys.child().type(); cudf::is_nested(child_type) && child_type.id() != type_id::STRUCT) { - CUDF_FAIL("Nested types other than STRUCT are not supported in `drop_list_duplicates`."); + CUDF_FAIL( + "Keys of nested types other than STRUCT are not supported in `drop_list_duplicates`."); + } + + CUDF_EXPECTS(!values || keys.size() == values.value().size(), + "Keys and values columns must have the same size."); + + if (keys.is_empty()) { + return std::pair{cudf::empty_like(keys.parent()), + values ? cudf::empty_like(values.value().parent()) : nullptr}; } - // Flatten all entries (depth = 1) of the lists column. - auto const lists_entries = lists_column.get_sliced_child(stream); + // The child column conotaining list entries. + auto const keys_child = keys.get_sliced_child(stream); + + // Generate a mapping from list entries to their 1-based list indices for the keys column. + auto const entries_list_indices = + generate_entry_list_indices(keys.size(), keys_child.size(), keys.offsets_begin(), stream); - // sorted_lists will store the results of the original lists after calling segmented_sort. - auto const sorted_lists = [&]() { - // If nans_equal == ALL_EQUAL and the column contains lists of floating-point data type, - // we need to replace -NaN by NaN before sorting. + // Generate segmented sorted order for key entries. + // The keys column will be sorted (gathered) using this order. + auto const sorted_order = [&]() { + auto const list_indices_view = column_view(data_type{type_to_id()}, + static_cast(entries_list_indices.size()), + entries_list_indices.data()); + + // If nans_equal == ALL_EQUAL and the keys column contains floating-point data type, + // we need to replace `-NaN` by `NaN` before sorting. 
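The offsets construction in `generate_output_offsets` above (zero-fill, scatter the per-list sizes to their 1-based positions, then prefix-sum) can be exercised in isolation. The sketch below replays the exact numbers from the comments (list_indices = [1, 4, 5, 7], list_sizes = [3, 1, 3, 2], 7 lists); it is illustrative only, not library code.

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <vector>

int main()
{
  int const num_lists = 7;
  std::vector<int> const h_indices{1, 4, 5, 7};  // 1-based index of each non-empty list
  std::vector<int> const h_sizes{3, 1, 3, 2};    // unique-entry count of each non-empty list
  thrust::device_vector<int> list_indices(h_indices);
  thrust::device_vector<int> list_sizes(h_sizes);
  thrust::device_vector<int> offsets(num_lists + 1, 0);  // zero-filled, like the uvector above

  // Place each list's size at its 1-based position: offsets = {0, 3, 0, 0, 1, 3, 0, 2}.
  thrust::scatter(thrust::device,
                  list_sizes.begin(),
                  list_sizes.end(),
                  list_indices.begin(),
                  offsets.begin());

  // Turn sizes into offsets with an inclusive scan: offsets = {0, 3, 3, 3, 4, 7, 7, 9}.
  thrust::inclusive_scan(thrust::device, offsets.begin(), offsets.end(), offsets.begin());
  return 0;
}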
auto const replace_negative_nan = nans_equal == nan_equality::ALL_EQUAL && - type_dispatcher( - lists_entries.type(), detail::has_negative_nans_dispatch{}, lists_entries, stream); + type_dispatcher(keys_child.type(), has_negative_nans_dispatch{}, keys_child, stream); + if (replace_negative_nan) { - auto const new_lists_column = - detail::replace_negative_nans_entries(lists_entries, lists_column, stream); - return detail::sort_lists( - lists_column_view(new_lists_column->view()), order::ASCENDING, null_order::AFTER, stream); + auto const replaced_nan_keys_child = + type_dispatcher(keys_child.type(), replace_negative_nans_dispatch{}, keys_child, stream); + return cudf::detail::stable_sorted_order( + table_view{{list_indices_view, replaced_nan_keys_child->view()}}, + {order::ASCENDING, order::ASCENDING}, + {null_order::AFTER, null_order::AFTER}, + stream); } else { - return detail::sort_lists(lists_column, order::ASCENDING, null_order::AFTER, stream); + return cudf::detail::stable_sorted_order(table_view{{list_indices_view, keys_child}}, + {order::ASCENDING, order::ASCENDING}, + {null_order::AFTER, null_order::AFTER}, + stream); } }(); - auto const sorted_lists_entries = - lists_column_view(sorted_lists->view()).get_sliced_child(stream); - - // Generate a 0-based offset column. - auto lists_offsets = detail::generate_clean_offsets(lists_column, stream, mr); - - // Generate a mapping from list entries to offsets of the lists containing those entries. - auto const entries_list_offsets = - detail::generate_entry_list_offsets(sorted_lists_entries.size(), lists_offsets->view(), stream); - - // Copy non-duplicated entries (along with their list offsets) to new arrays. - auto unique_entries_and_list_offsets = detail::get_unique_entries_and_list_offsets( - sorted_lists_entries, entries_list_offsets->view(), nulls_equal, nans_equal, stream, mr); - - // Generate offsets for the new lists column. - detail::generate_offsets(unique_entries_and_list_offsets.front()->size(), - unique_entries_and_list_offsets.back()->view(), - lists_offsets->mutable_view(), - stream); - - // Construct a new lists column without duplicated entries. - // Reuse the null_count and bitmask of the lists_column: those are the null information for - // the list elements (rows). - // For the entries of those lists (rows), their null_count and bitmask were generated separately - // during the step `get_unique_entries_and_list_offsets` above. - return make_lists_column(lists_column.size(), - std::move(lists_offsets), - std::move(unique_entries_and_list_offsets.front()), - lists_column.null_count(), - cudf::detail::copy_bitmask(lists_column.parent(), stream, mr)); + auto const sorting_table = values + ? table_view{{keys_child, values.value().get_sliced_child(stream)}} + : table_view{{keys_child}}; + auto const sorted_table = cudf::detail::gather(sorting_table, + sorted_order->view(), + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream); + + // Extract the segmented sorted key entries. + auto const sorted_keys_entries = sorted_table->get_column(0).view(); + auto const sorted_values_entries = + values ? std::optional(sorted_table->get_column(1).view()) : std::nullopt; + + // Generate child columns containing unique entries (along with their list indices). + // null_count and bitmask of these columns will also be generated in this function. 
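The `stable_sorted_order` calls above sort on the pair (list index, key) so that every list is ordered independently and duplicate keys end up adjacent, with the stable sort preserving the original relative order of ties. Stripped of the null/NaN handling and the separate values gather, the core idea looks like this in bare Thrust (illustrative data only, not the library code):

#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>
#include <vector>

int main()
{
  std::vector<int> const h_list{0, 0, 0, 1, 1, 2};  // list label per entry
  std::vector<int> const h_keys{9, 3, 3, 7, 1, 5};  // key entry values
  thrust::device_vector<int> list_idx(h_list);
  thrust::device_vector<int> keys(h_keys);

  // Lexicographic stable sort on (list label, key): entries never leave their list.
  auto const zipped =
    thrust::make_zip_iterator(thrust::make_tuple(list_idx.begin(), keys.begin()));
  thrust::stable_sort(zipped, zipped + keys.size());

  // keys is now {3, 3, 9, 1, 7, 5}: ascending within each list, duplicates adjacent.
  return 0;
}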
+ auto unique_entries_and_list_indices = get_unique_entries_and_list_indices(sorted_keys_entries, + sorted_values_entries, + entries_list_indices, + nulls_equal, + nans_equal, + keep_option, + stream, + mr); + + // Generate offsets for the output lists column(s). + auto output_offsets = generate_output_offsets( + keys.size(), + unique_entries_and_list_indices.front()->size(), // num unique entries + unique_entries_and_list_indices.back()->view(), // unique entries' list indices + stream, + mr); + + // If the values lists column is not given, its corresponding output will be nullptr. + auto out_values = + values ? make_lists_column(keys.size(), + std::make_unique(output_offsets->view()), + std::move(unique_entries_and_list_indices[1]), + values.value().null_count(), + cudf::detail::copy_bitmask(values.value().parent(), stream, mr)) + : nullptr; + + auto out_keys = make_lists_column(keys.size(), + std::move(output_offsets), + std::move(unique_entries_and_list_indices[0]), + keys.null_count(), + cudf::detail::copy_bitmask(keys.parent(), stream, mr)); + + return std::pair{std::move(out_keys), std::move(out_values)}; +} + +} // anonymous namespace + +std::pair, std::unique_ptr> drop_list_duplicates( + lists_column_view const& keys, + lists_column_view const& values, + null_equality nulls_equal, + nan_equality nans_equal, + duplicate_keep_option keep_option, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return drop_list_duplicates_common(keys, + std::optional(values), + nulls_equal, + nans_equal, + keep_option, + stream, + mr); +} + +std::unique_ptr drop_list_duplicates(lists_column_view const& input, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return drop_list_duplicates_common(input, + std::nullopt, + nulls_equal, + nans_equal, + duplicate_keep_option::KEEP_FIRST, + stream, + mr) + .first; } } // namespace detail /** - * @copydoc cudf::lists::drop_list_duplicates + * @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&, + * lists_column_view const&, + * duplicate_keep_option, + * null_equality, + * nan_equality, + * rmm::mr::device_memory_resource*) */ -std::unique_ptr drop_list_duplicates(lists_column_view const& lists_column, +std::pair, std::unique_ptr> drop_list_duplicates( + lists_column_view const& keys, + lists_column_view const& values, + duplicate_keep_option keep_option, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::drop_list_duplicates( + keys, values, nulls_equal, nans_equal, keep_option, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&, + * null_equality, + * nan_equality, + * rmm::mr::device_memory_resource*) + */ +std::unique_ptr drop_list_duplicates(lists_column_view const& input, null_equality nulls_equal, nan_equality nans_equal, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::drop_list_duplicates( - lists_column, nulls_equal, nans_equal, rmm::cuda_stream_default, mr); + return detail::drop_list_duplicates(input, nulls_equal, nans_equal, rmm::cuda_stream_default, mr); } -} // namespace lists -} // namespace cudf +} // namespace cudf::lists diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index c8ef4912392..381864e1a68 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -16,103 +16,131 @@ #include #include #include -#include 
+#include +#include +#include #include +#include #include #include -#include +#include +#include + +#include namespace cudf { namespace lists { namespace detail { - namespace { /** - * @brief Convert index value for each sublist into a gather index for - * the lists column's child column. + * @brief Helper to construct a column of indices, for use with `segmented_gather()`. + * + * When indices are specified as a column, e.g. `{5, -4, 3, -2, 1, null}`, + * the column returned is: `{5, -4, 3, -2, 1, MAX_SIZE_TYPE}`. + * All null indices are replaced with `MAX_SIZE_TYPE = numeric_limits::max()`. + * + * The returned column can then be used to construct a lists column, for use + * with `segmented_gather()`. + */ +std::unique_ptr make_index_child(column_view const& indices, + size_type, + rmm::cuda_stream_view stream) +{ + // New column, near identical to `indices`, except with null values replaced. + // `segmented_gather()` on a null index should produce a null row. + if (not indices.nullable()) { return std::make_unique(indices, stream); } + + auto const d_indices = column_device_view::create(indices); + // Replace null indices with MAX_SIZE_TYPE, so that gather() returns null for them. + auto const null_replaced_iter_begin = + cudf::detail::make_null_replacement_iterator(*d_indices, std::numeric_limits::max()); + auto index_child = + make_numeric_column(data_type{type_id::INT32}, indices.size(), mask_state::UNALLOCATED, stream); + thrust::copy_n(rmm::exec_policy(stream), + null_replaced_iter_begin, + indices.size(), + index_child->mutable_view().begin()); + return index_child; +} + +/** + * @brief Helper to construct a column of indices, for use with `segmented_gather()`. + * + * When indices are specified as a size_type, e.g. `7`, + * the column returned is: `{ 7, 7, 7, 7, 7 }`. + * + * The returned column can then be used to construct a lists column, for use + * with `segmented_gather()`. + */ +std::unique_ptr make_index_child(size_type index, + size_type num_rows, + rmm::cuda_stream_view stream) +{ + auto index_child = // [index, index, index, ..., index] + make_numeric_column(data_type{type_id::INT32}, num_rows, mask_state::UNALLOCATED, stream); + thrust::fill_n( + rmm::exec_policy(stream), index_child->mutable_view().begin(), num_rows, index); + return index_child; +} + +/** + * @brief Helper to construct offsets column for an index vector. + * + * Constructs the sequence: `{ 0, 1, 2, 3, ... num_lists + 1}`. + * This may be used to construct an "index-list" column, where each list row + * has a single element. */ -template -struct map_index_fn { - column_device_view const d_offsets; // offsets to each sublist (including validity mask) - size_type const index; // index of element within each sublist - size_type const out_of_bounds; // value to use to indicate out-of-bounds - - __device__ int32_t operator()(size_type idx) - { - if (d_offsets.is_null(idx)) return out_of_bounds; - auto const offset = d_offsets.element(idx); - auto const length = d_offsets.element(idx + 1) - offset; - if (PositiveIndex) - return index < length ? index + offset : out_of_bounds; - else - return index >= -length ? 
length + index + offset : out_of_bounds; - } -}; +std::unique_ptr make_index_offsets(size_type num_lists, rmm::cuda_stream_view stream) +{ + return cudf::detail::sequence( + num_lists + 1, cudf::scalar_type_t(0, true, stream), stream); +} } // namespace /** * @copydoc cudf::lists::extract_list_element - * + * @tparam index_t The type used to specify the index values (either column_view or size_type) * @param stream CUDA stream used for device memory operations and kernel launches. */ +template std::unique_ptr extract_list_element(lists_column_view lists_column, - size_type index, + index_t const& index, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (lists_column.is_empty()) return empty_like(lists_column.child()); - auto const offsets_column = lists_column.offsets(); - - // create a column_view with attributes of the parent and data from the offsets - column_view annotated_offsets(data_type{type_id::INT32}, - lists_column.size() + 1, - offsets_column.data(), - lists_column.null_mask(), - lists_column.null_count(), - lists_column.offset()); - - // create a gather map for extracting elements from the child column - auto gather_map = make_fixed_width_column( - data_type{type_id::INT32}, annotated_offsets.size() - 1, mask_state::UNALLOCATED, stream); - auto d_gather_map = gather_map->mutable_view().data(); - auto const child_column = lists_column.child(); - - // build the gather map using the offsets and the provided index - auto const d_column = column_device_view::create(annotated_offsets, stream); - if (index < 0) - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(gather_map->size()), - d_gather_map, - map_index_fn{*d_column, index, child_column.size()}); - else - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(gather_map->size()), - d_gather_map, - map_index_fn{*d_column, index, child_column.size()}); - - // call gather on the child column - auto result = cudf::detail::gather(table_view({child_column}), - gather_map->view(), - out_of_bounds_policy::NULLIFY, // nullify-out-of-bounds - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr) - ->release(); - if (result.front()->null_count() == 0) - result.front()->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); - return std::unique_ptr(std::move(result.front())); + auto const num_lists = lists_column.size(); + if (num_lists == 0) { return empty_like(lists_column.child()); } + + // Given an index (or indices vector), an index lists column may be constructed, + // with each list row having a single element. + // E.g. + // 1. If index = 7, index_lists_column = { {7}, {7}, {7}, {7}, ... }. + // 2. If indices = {4, 3, 2, 1, null}, + // index_lists_column = { {4}, {3}, {2}, {1}, {MAX_SIZE_TYPE} }. 
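At the API level, the column-index overload introduced by this patch behaves as sketched below. The data is hypothetical and the snippet leans on cudf's test wrappers purely to keep the column construction short; it is a usage illustration, not part of the patch.

#include <cudf/lists/extract.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void example()
{
  cudf::test::lists_column_wrapper<int32_t> lists{{1, 2, 3}, {4, 5}, {6}};
  cudf::test::fixed_width_column_wrapper<cudf::size_type> indices{2, -1, 1};

  // Row-wise extraction: index 2 of {1,2,3} -> 3, index -1 (last element) of {4,5} -> 5,
  // index 1 of {6} is out of bounds -> null (NULLIFY policy).
  auto const result =
    cudf::lists::extract_list_element(cudf::lists_column_view{lists}, indices);
  (void)result;
}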
+ + auto const index_lists_column = make_lists_column(num_lists, + make_index_offsets(num_lists, stream), + make_index_child(index, num_lists, stream), + 0, + {}, + stream); + + auto extracted_lists = segmented_gather( + lists_column, index_lists_column->view(), out_of_bounds_policy::NULLIFY, stream, mr); + + return std::move(extracted_lists->release().children[lists_column_view::child_column_index]); } } // namespace detail /** - * @copydoc cudf::lists::extract_list_element + * @copydoc cudf::lists::extract_list_element(lists_column_view const&, + * size_type, + * rmm::mr::device_memory_resource*) */ std::unique_ptr extract_list_element(lists_column_view const& lists_column, size_type index, @@ -121,5 +149,19 @@ std::unique_ptr extract_list_element(lists_column_view const& lists_colu return detail::extract_list_element(lists_column, index, rmm::cuda_stream_default, mr); } +/** + * @copydoc cudf::lists::extract_list_element(lists_column_view const&, + * column_view const&, + * rmm::mr::device_memory_resource*) + */ +std::unique_ptr extract_list_element(lists_column_view const& lists_column, + column_view const& indices, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(indices.size() == lists_column.size(), + "Index column must have as many elements as lists column."); + return detail::extract_list_element(lists_column, indices, rmm::cuda_stream_default, mr); +} + } // namespace lists } // namespace cudf diff --git a/cpp/src/lists/lists_column_factories.cu b/cpp/src/lists/lists_column_factories.cu index 9c1cbd8a8cc..0140dc56bab 100644 --- a/cpp/src/lists/lists_column_factories.cu +++ b/cpp/src/lists/lists_column_factories.cu @@ -36,7 +36,7 @@ std::unique_ptr make_lists_column_from_scalar(list_scalar const& v { if (size == 0) { return make_lists_column(0, - make_empty_column(data_type{type_to_id()}), + make_empty_column(type_to_id()), empty_like(value.view()), 0, cudf::detail::create_null_mask(0, mask_state::UNALLOCATED, stream, mr), diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu index b085d1e77d1..088db226c24 100644 --- a/cpp/src/lists/segmented_sort.cu +++ b/cpp/src/lists/segmented_sort.cu @@ -268,6 +268,40 @@ std::unique_ptr sort_lists(lists_column_view const& input, input.null_count(), std::move(null_mask)); } + +std::unique_ptr stable_sort_lists(lists_column_view const& input, + order column_order, + null_order null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) { return empty_like(input.parent()); } + + auto output_offset = make_numeric_column( + input.offsets().type(), input.size() + 1, mask_state::UNALLOCATED, stream, mr); + thrust::transform(rmm::exec_policy(stream), + input.offsets_begin(), + input.offsets_end(), + output_offset->mutable_view().template begin(), + [first = input.offsets_begin()] __device__(auto offset_index) { + return offset_index - *first; + }); + + auto const child = input.get_sliced_child(stream); + auto const sorted_child_table = stable_segmented_sort_by_key(table_view{{child}}, + table_view{{child}}, + output_offset->view(), + {column_order}, + {null_precedence}, + stream, + mr); + + return make_lists_column(input.size(), + std::move(output_offset), + std::move(sorted_child_table->release().front()), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} } // namespace detail std::unique_ptr sort_lists(lists_column_view const& input, @@ -279,5 +313,15 @@ std::unique_ptr sort_lists(lists_column_view const& input, return 
detail::sort_lists(input, column_order, null_precedence, rmm::cuda_stream_default, mr); } +std::unique_ptr stable_sort_lists(lists_column_view const& input, + order column_order, + null_order null_precedence, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::stable_sort_lists( + input, column_order, null_precedence, rmm::cuda_stream_default, mr); +} + } // namespace lists } // namespace cudf diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 9aea59a195b..57c221b15ed 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include @@ -28,6 +28,8 @@ #include +using namespace cudf::tdigest; + namespace cudf { namespace detail { namespace tdigest { @@ -166,27 +168,20 @@ __global__ void compute_percentiles_kernel(device_span tdiges * * @returns Column of doubles containing requested percentile values. */ -std::unique_ptr compute_approx_percentiles(structs_column_view const& input, +std::unique_ptr compute_approx_percentiles(tdigest_column_view const& input, column_view const& percentiles, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - lists_column_view lcv(input.child(centroid_column_index)); - column_view min_col = input.child(min_column_index); - column_view max_col = input.child(max_column_index); + tdigest_column_view tdv(input); // offsets, representing the size of each tdigest - auto offsets = lcv.offsets(); - - // extract means and weights - auto data = lcv.parent().child(lists_column_view::child_column_index); - structs_column_view tdigest(data); - auto mean = tdigest.child(mean_column_index); - auto weight = tdigest.child(weight_column_index); + auto offsets = tdv.centroids().offsets(); // compute summed weights + auto weight = tdv.weights(); auto cumulative_weights = cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, - mean.size(), + weight.size(), mask_state::UNALLOCATED, stream, rmm::mr::get_current_device_resource()); @@ -225,7 +220,7 @@ std::unique_ptr compute_approx_percentiles(structs_column_view const& in data_type{type_id::FLOAT64}, num_output_values, std::move(null_mask), null_count, stream, mr); auto centroids = cudf::detail::make_counting_transform_iterator( - 0, make_centroid{mean.begin(), weight.begin()}); + 0, make_centroid{tdv.means().begin(), tdv.weights().begin()}); constexpr size_type block_size = 256; cudf::detail::grid_1d const grid(percentiles.size() * input.size(), block_size); @@ -233,60 +228,61 @@ std::unique_ptr compute_approx_percentiles(structs_column_view const& in {offsets.begin(), static_cast(offsets.size())}, *percentiles_cdv, centroids, - min_col.begin(), - max_col.begin(), + tdv.min_begin(), + tdv.max_begin(), cumulative_weights->view().begin(), result->mutable_view().begin()); return result; } -void check_is_valid_tdigest_column(column_view const& col) +std::unique_ptr make_tdigest_column(size_type num_rows, + std::unique_ptr&& centroid_means, + std::unique_ptr&& centroid_weights, + std::unique_ptr&& tdigest_offsets, + std::unique_ptr&& min_values, + std::unique_ptr&& max_values, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - // sanity check that this is actually tdigest data - CUDF_EXPECTS(col.type().id() == type_id::STRUCT, "Encountered invalid tdigest column"); - CUDF_EXPECTS(col.size() > 0, "tdigest columns must have > 0 rows"); - CUDF_EXPECTS(col.offset() == 0, "Encountered a sliced tdigest column"); - 
CUDF_EXPECTS(col.nullable() == false, "Encountered nullable tdigest column"); - - structs_column_view scv(col); - CUDF_EXPECTS(scv.num_children() == 3, "Encountered invalid tdigest column"); - CUDF_EXPECTS(scv.child(min_column_index).type().id() == type_id::FLOAT64, - "Encountered invalid tdigest column"); - CUDF_EXPECTS(scv.child(max_column_index).type().id() == type_id::FLOAT64, - "Encountered invalid tdigest column"); - - lists_column_view lcv(scv.child(centroid_column_index)); - auto data = lcv.child(); - CUDF_EXPECTS(data.type().id() == type_id::STRUCT, "Encountered invalid tdigest column"); - CUDF_EXPECTS(data.num_children() == 2, - "Encountered tdigest column with an invalid number of children"); - auto mean = data.child(mean_column_index); - CUDF_EXPECTS(mean.type().id() == type_id::FLOAT64, "Encountered invalid tdigest mean column"); - auto weight = data.child(weight_column_index); - CUDF_EXPECTS(weight.type().id() == type_id::FLOAT64, "Encountered invalid tdigest weight column"); + CUDF_EXPECTS(tdigest_offsets->size() == num_rows + 1, + "Encountered unexpected offset count in make_tdigest_column"); + CUDF_EXPECTS(centroid_means->size() == centroid_weights->size(), + "Encountered unexpected centroid size mismatch in make_tdigest_column"); + CUDF_EXPECTS(min_values->size() == num_rows, + "Encountered unexpected min value count in make_tdigest_column"); + CUDF_EXPECTS(max_values->size() == num_rows, + "Encountered unexpected max value count in make_tdigest_column"); + + // inner struct column + auto const centroids_size = centroid_means->size(); + std::vector> inner_children; + inner_children.push_back(std::move(centroid_means)); + inner_children.push_back(std::move(centroid_weights)); + auto tdigest_data = + cudf::make_structs_column(centroids_size, std::move(inner_children), 0, {}, stream, mr); + + // grouped into lists + auto tdigest = + cudf::make_lists_column(num_rows, std::move(tdigest_offsets), std::move(tdigest_data), 0, {}); + + // create the final column + std::vector> children; + children.push_back(std::move(tdigest)); + children.push_back(std::move(min_values)); + children.push_back(std::move(max_values)); + return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr); } std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // mean/weight columns - std::vector> inner_children; - inner_children.push_back(make_empty_column(data_type(type_id::FLOAT64))); - inner_children.push_back(make_empty_column(data_type(type_id::FLOAT64))); - auto offsets = cudf::make_fixed_width_column( data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr); thrust::fill(rmm::exec_policy(stream), offsets->mutable_view().begin(), offsets->mutable_view().end(), 0); - auto list = - make_lists_column(1, - std::move(offsets), - cudf::make_structs_column(0, std::move(inner_children), 0, {}, stream, mr), - 0, - {}); auto min_col = cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr); @@ -301,22 +297,24 @@ std::unique_ptr make_empty_tdigest_column(rmm::cuda_stream_view stream, max_col->mutable_view().end(), 0); - std::vector> children; - children.push_back(std::move(list)); - children.push_back(std::move(min_col)); - children.push_back(std::move(max_col)); - - return make_structs_column(1, std::move(children), 0, {}, stream, mr); + return make_tdigest_column(1, + make_empty_column(type_id::FLOAT64), + make_empty_column(type_id::FLOAT64), + std::move(offsets), + std::move(min_col), + 
std::move(max_col), + stream, + mr); } } // namespace tdigest. -std::unique_ptr percentile_approx(structs_column_view const& input, +std::unique_ptr percentile_approx(tdigest_column_view const& input, column_view const& percentiles, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - tdigest::check_is_valid_tdigest_column(input); + tdigest_column_view tdv(input); CUDF_EXPECTS(percentiles.type().id() == type_id::FLOAT64, "percentile_approx expects float64 percentile inputs"); @@ -333,7 +331,7 @@ std::unique_ptr percentile_approx(structs_column_view const& input, return cudf::make_lists_column( input.size(), std::move(offsets), - cudf::make_empty_column(data_type{type_id::FLOAT64}), + cudf::make_empty_column(type_id::FLOAT64), input.size(), cudf::detail::create_null_mask( input.size(), mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr)); @@ -341,24 +339,20 @@ std::unique_ptr percentile_approx(structs_column_view const& input, // if any of the input digests are empty, nullify the corresponding output rows (values will be // uninitialized) - auto [bitmask, null_count] = [stream, mr, input]() { - lists_column_view lcv(input.child(tdigest::centroid_column_index)); - auto iter = cudf::detail::make_counting_transform_iterator( - 0, [offsets = lcv.offsets().begin()] __device__(size_type index) { - return offsets[index + 1] - offsets[index] == 0 ? 1 : 0; - }); - auto const null_count = thrust::reduce(rmm::exec_policy(stream), iter, iter + input.size(), 0); + auto [bitmask, null_count] = [stream, mr, &tdv]() { + auto tdigest_is_empty = thrust::make_transform_iterator( + tdv.size_begin(), + [] __device__(size_type tdigest_size) -> size_type { return tdigest_size == 0; }); + auto const null_count = + thrust::reduce(rmm::exec_policy(stream), tdigest_is_empty, tdigest_is_empty + tdv.size(), 0); if (null_count == 0) { return std::pair{rmm::device_buffer{}, null_count}; } - return cudf::detail::valid_if( - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(0) + input.size(), - [offsets = lcv.offsets().begin()] __device__(size_type index) { - return offsets[index + 1] - offsets[index] == 0 ? 0 : 1; - }, - stream, - mr); + return cudf::detail::valid_if(tdigest_is_empty, + tdigest_is_empty + tdv.size(), + thrust::logical_not{}, + stream, + mr); }(); return cudf::make_lists_column( @@ -373,7 +367,7 @@ std::unique_ptr percentile_approx(structs_column_view const& input, } // namespace detail -std::unique_ptr percentile_approx(structs_column_view const& input, +std::unique_ptr percentile_approx(tdigest_column_view const& input, column_view const& percentiles, rmm::mr::device_memory_resource* mr) { diff --git a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp new file mode 100644 index 00000000000..a86b40fd64a --- /dev/null +++ b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +namespace cudf { +namespace tdigest { + +using namespace cudf; + +tdigest_column_view::tdigest_column_view(column_view const& col) : column_view(col) +{ + // sanity check that this is actually tdigest data + CUDF_EXPECTS(col.type().id() == type_id::STRUCT, "Encountered invalid tdigest column"); + CUDF_EXPECTS(col.size() > 0, "tdigest columns must have > 0 rows"); + CUDF_EXPECTS(col.offset() == 0, "Encountered a sliced tdigest column"); + CUDF_EXPECTS(col.nullable() == false, "Encountered nullable tdigest column"); + + structs_column_view scv(col); + CUDF_EXPECTS(scv.num_children() == 3, "Encountered invalid tdigest column"); + CUDF_EXPECTS(scv.child(min_column_index).type().id() == type_id::FLOAT64, + "Encountered invalid tdigest column"); + CUDF_EXPECTS(scv.child(max_column_index).type().id() == type_id::FLOAT64, + "Encountered invalid tdigest column"); + + lists_column_view lcv(scv.child(centroid_column_index)); + auto data = lcv.child(); + CUDF_EXPECTS(data.type().id() == type_id::STRUCT, "Encountered invalid tdigest column"); + CUDF_EXPECTS(data.num_children() == 2, + "Encountered tdigest column with an invalid number of children"); + auto mean = data.child(mean_column_index); + CUDF_EXPECTS(mean.type().id() == type_id::FLOAT64, "Encountered invalid tdigest mean column"); + auto weight = data.child(weight_column_index); + CUDF_EXPECTS(weight.type().id() == type_id::FLOAT64, "Encountered invalid tdigest weight column"); +} + +lists_column_view tdigest_column_view::centroids() const { return child(centroid_column_index); } + +column_view tdigest_column_view::means() const +{ + auto c = centroids(); + structs_column_view inner(c.parent().child(lists_column_view::child_column_index)); + return inner.child(mean_column_index); +} + +column_view tdigest_column_view::weights() const +{ + auto c = centroids(); + structs_column_view inner(c.parent().child(lists_column_view::child_column_index)); + return inner.child(weight_column_index); +} + +double const* tdigest_column_view::min_begin() const +{ + return child(min_column_index).begin(); +} + +double const* tdigest_column_view::max_begin() const +{ + return child(max_column_index).begin(); +} + +} // namespace tdigest +} // namespace cudf diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index 59a614664c9..161f892fbcb 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -142,8 +142,7 @@ struct minmax_functor { template static constexpr bool is_supported() { - return !(cudf::is_fixed_point() || std::is_same_v || - std::is_same_v); + return !(std::is_same_v || std::is_same_v); } template @@ -187,15 +186,16 @@ struct minmax_functor { std::pair, std::unique_ptr> operator()( cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + using storage_type = device_storage_type_t; // compute minimum and maximum values - auto dev_result = reduce(col, stream); + auto dev_result = reduce(col, stream); // create output scalars using ScalarType = cudf::scalar_type_t; auto minimum = new ScalarType(T{}, true, stream, mr); auto maximum = new ScalarType(T{}, true, stream, mr); // copy dev_result to the output scalars - device_single_thread(assign_min_max{dev_result.data(), minimum->data(), maximum->data()}, - stream); + device_single_thread( + assign_min_max{dev_result.data(), minimum->data(), maximum->data()}, stream); return {std::unique_ptr(minimum), std::unique_ptr(maximum)}; } diff --git 
a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 0d3ac2d366f..6f9149a47e2 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -68,7 +68,8 @@ struct reduce_dispatch_functor { } break; case aggregation::MEDIAN: { auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream); - auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; + auto valid_sorted_indices = + split(*sorted_indices, {col.size() - col.null_count()}, stream)[0]; auto col_ptr = quantile(col, {0.5}, interpolation::LINEAR, valid_sorted_indices, true, stream); return get_element(*col_ptr, 0, stream, mr); @@ -78,7 +79,8 @@ struct reduce_dispatch_functor { CUDF_EXPECTS(quantile_agg->_quantiles.size() == 1, "Reduction quantile accepts only one quantile value"); auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream); - auto valid_sorted_indices = split(*sorted_indices, {col.size() - col.null_count()})[0]; + auto valid_sorted_indices = + split(*sorted_indices, {col.size() - col.null_count()}, stream)[0]; auto col_ptr = quantile(col, quantile_agg->_quantiles, diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu index 566b9aadea8..e7f1e867a41 100644 --- a/cpp/src/reductions/scan/rank_scan.cu +++ b/cpp/src/reductions/scan/rank_scan.cu @@ -14,10 +14,9 @@ * limitations under the License. */ -#include - #include #include +#include #include #include @@ -51,14 +50,12 @@ std::unique_ptr rank_generator(column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const superimposed = structs::detail::superimpose_parent_nulls(order_by, stream, mr); - table_view const order_table{{std::get<0>(superimposed)}}; - auto const flattener = cudf::structs::detail::flatten_nested_columns( - order_table, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); - auto const d_flat_order = table_device_view::create(std::get<0>(flattener), stream); + auto const flattened = cudf::structs::detail::flatten_nested_columns( + table_view{{order_by}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); + auto const d_flat_order = table_device_view::create(flattened, stream); row_equality_comparator comparator(*d_flat_order, *d_flat_order, true); auto ranks = make_fixed_width_column(data_type{type_to_id()}, - order_table.num_rows(), + flattened.flattened_columns().num_rows(), mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/src/reductions/scan/scan.cuh b/cpp/src/reductions/scan/scan.cuh index 8352d16b2d0..5eeb6a1deb5 100644 --- a/cpp/src/reductions/scan/scan.cuh +++ b/cpp/src/reductions/scan/scan.cuh @@ -40,10 +40,6 @@ std::unique_ptr scan_agg_dispatch(const column_view& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS( - is_numeric(input.type()) || is_compound(input.type()) || is_fixed_point(input.type()), - "Unexpected non-numeric or non-string type."); - switch (agg->kind) { case aggregation::SUM: return type_dispatcher( diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu index 9dbf66bd078..02ecd6df4d9 100644 --- a/cpp/src/reductions/scan/scan_inclusive.cu +++ b/cpp/src/reductions/scan/scan_inclusive.cu @@ -18,10 +18,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -29,6 +29,8 @@ #include +#include + namespace cudf { namespace detail { @@ -62,26 +64,25 @@ rmm::device_buffer 
mask_scan(column_view const& input_view, namespace { /** - * @brief Strings inclusive scan operator + * @brief Min/Max inclusive scan operator + * + * This operator will accept index values, check them and then + * run the `Op` operation on the individual element objects. + * The returned result is the appropriate index value. * * This was specifically created to workaround a thrust issue * https://github.com/NVIDIA/thrust/issues/1479 * where invalid values are passed to the operator. - * - * This operator will accept index values, check them and then - * run the `Op` operation on the individual string_view objects. - * The returned result is the appropriate index value. */ -template -struct string_scan_operator { - column_device_view const col; ///< strings column device view - string_view const null_replacement{}; ///< value used when element is null - bool const has_nulls; ///< true if col has null elements - - string_scan_operator(column_device_view const& col, bool has_nulls = true) - : col{col}, null_replacement{Op::template identity()}, has_nulls{has_nulls} +template +struct min_max_scan_operator { + column_device_view const col; ///< strings column device view + Element const null_replacement{}; ///< value used when element is null + bool const has_nulls; ///< true if col has null elements + + min_max_scan_operator(column_device_view const& col, bool has_nulls = true) + : col{col}, null_replacement{Op::template identity()}, has_nulls{has_nulls} { - CUDF_EXPECTS(type_id::STRING == col.type().id(), "the data type mismatch"); // verify validity bitmask is non-null, otherwise, is_null_nocheck() will crash if (has_nulls) CUDF_EXPECTS(col.nullable(), "column with nulls must have a validity bitmask"); } @@ -92,41 +93,19 @@ struct string_scan_operator { // thrust::inclusive_scan may pass us garbage values so we need to protect ourselves; // in these cases the return value does not matter since the result is not used if (lhs < 0 || rhs < 0 || lhs >= col.size() || rhs >= col.size()) return 0; - string_view d_lhs = - has_nulls && col.is_null_nocheck(lhs) ? null_replacement : col.element(lhs); - string_view d_rhs = - has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element(rhs); + Element d_lhs = + has_nulls && col.is_null_nocheck(lhs) ? null_replacement : col.element(lhs); + Element d_rhs = + has_nulls && col.is_null_nocheck(rhs) ? null_replacement : col.element(rhs); return Op{}(d_lhs, d_rhs) == d_lhs ? 
lhs : rhs; } }; -/** - * @brief Dispatcher for running a Scan operation on an input column - * - * @tparam Op device binary operator - */ -template -struct scan_dispatcher { - private: - template - static constexpr bool is_string_supported() - { - return std::is_same_v && - (std::is_same_v || std::is_same_v); - } - - template - static constexpr bool is_supported() - { - return std::is_arithmetic::value || is_string_supported(); - } - - // for arithmetic types - template ::value>* = nullptr> - auto inclusive_scan(column_view const& input_view, - null_policy, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +template +struct scan_functor { + static std::unique_ptr invoke(column_view const& input_view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto output_column = detail::allocate_like( input_view, input_view.size(), mask_allocation_policy::NEVER, stream, mr); @@ -141,27 +120,48 @@ struct scan_dispatcher { CHECK_CUDA(stream.value()); return output_column; } +}; - // for string type: only MIN and MAX are supported - template ()>* = nullptr> - std::unique_ptr inclusive_scan(column_view const& input_view, - null_policy, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +template +struct scan_functor { + static std::unique_ptr invoke(column_view const& input_view, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto d_input = column_device_view::create(input_view, stream); // build indices of the scan operation results rmm::device_uvector result(input_view.size(), stream); - thrust::inclusive_scan(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input_view.size()), - result.begin(), - string_scan_operator{*d_input, input_view.has_nulls()}); + thrust::inclusive_scan( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input_view.size()), + result.begin(), + min_max_scan_operator{*d_input, input_view.has_nulls()}); // call gather using the indices to build the output column - return cudf::strings::detail::gather( - strings_column_view(input_view), result.begin(), result.end(), false, stream, mr); + auto result_table = cudf::detail::gather(cudf::table_view({input_view}), + result, + out_of_bounds_policy::DONT_CHECK, + negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(result_table->release().front()); + } +}; + +/** + * @brief Dispatcher for running a Scan operation on an input column + * + * @tparam Op device binary operator + */ +template +struct scan_dispatcher { + private: + template + static constexpr bool is_supported() + { + return std::is_invocable_v && !cudf::is_dictionary(); } public: @@ -178,17 +178,17 @@ struct scan_dispatcher { */ template ()>* = nullptr> std::unique_ptr operator()(column_view const& input, - null_policy null_handling, + null_policy, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return inclusive_scan(input, null_handling, stream, mr); + return scan_functor::invoke(input, stream, mr); } template std::enable_if_t(), std::unique_ptr> operator()(Args&&...) 
{ - CUDF_FAIL("Non-arithmetic types not supported for inclusive scan"); + CUDF_FAIL("Unsupported type for inclusive scan operation"); } }; diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh index c8345a30f79..d83ad91d89b 100644 --- a/cpp/src/reductions/simple.cuh +++ b/cpp/src/reductions/simple.cuh @@ -115,7 +115,7 @@ std::unique_ptr fixed_point_reduction(column_view const& col, }(); auto const val = static_cast*>(result.get()); - return cudf::make_fixed_point_scalar(val->value(), scale); + return cudf::make_fixed_point_scalar(val->value(stream), scale); } /** diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index b0373d41963..713b3b27a2b 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -281,7 +281,7 @@ std::unique_ptr dispatch_clamp::operator()( auto matched_view = dictionary_column_view(input); std::unique_ptr result = nullptr; auto add_scalar_key = [&](scalar const& key, scalar const& key_replace) { - if (key.is_valid()) { + if (key.is_valid(stream)) { result = dictionary::detail::add_keys( matched_view, make_column_from_scalar(key_replace, 1, stream)->view(), stream, mr); matched_view = dictionary_column_view(result->view()); diff --git a/cpp/src/replace/nans.cu b/cpp/src/replace/nans.cu index 79c85ebe746..c1c26573692 100644 --- a/cpp/src/replace/nans.cu +++ b/cpp/src/replace/nans.cu @@ -26,6 +26,7 @@ #include #include +#include #include @@ -54,53 +55,18 @@ struct replace_nans_functor { return dinput.is_null(i) or !std::isnan(dinput.element(i)); }; - if (input.has_nulls()) { - auto input_pair_iterator = make_pair_iterator(*input_device_view); - if (replacement_nullable) { - auto replacement_pair_iterator = make_pair_iterator(replacement); - return copy_if_else(true, - input_pair_iterator, - input_pair_iterator + size, - replacement_pair_iterator, - predicate, - input.type(), - stream, - mr); - } else { - auto replacement_pair_iterator = make_pair_iterator(replacement); - return copy_if_else(true, - input_pair_iterator, - input_pair_iterator + size, - replacement_pair_iterator, - predicate, - input.type(), - stream, - mr); - } - } else { - auto input_pair_iterator = make_pair_iterator(*input_device_view); - if (replacement_nullable) { - auto replacement_pair_iterator = make_pair_iterator(replacement); - return copy_if_else(true, - input_pair_iterator, - input_pair_iterator + size, - replacement_pair_iterator, - predicate, - input.type(), - stream, - mr); - } else { - auto replacement_pair_iterator = make_pair_iterator(replacement); - return copy_if_else(false, - input_pair_iterator, - input_pair_iterator + size, - replacement_pair_iterator, - predicate, - input.type(), - stream, - mr); - } - } + auto input_iterator = + make_optional_iterator(*input_device_view, contains_nulls::DYNAMIC{}, input.has_nulls()); + auto replacement_iterator = + make_optional_iterator(replacement, contains_nulls::DYNAMIC{}, replacement_nullable); + return copy_if_else(input.has_nulls() or replacement_nullable, + input_iterator, + input_iterator + size, + replacement_iterator, + predicate, + input.type(), + stream, + mr); } template @@ -224,33 +190,37 @@ void normalize_nans_and_zeros(mutable_column_view in_out, rmm::cuda_stream_view input.type(), normalize_nans_and_zeros_kernel_forwarder{}, *device_in, *device_out, stream); } +std::unique_ptr normalize_nans_and_zeros(column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // output. 
copies the input + auto out = std::make_unique(input, stream, mr); + + // from device. unique_ptr which gets automatically cleaned up when we leave. + auto out_view = out->mutable_view(); + normalize_nans_and_zeros(out_view, stream); + + return out; +} + } // namespace detail /** - * @brief Makes all NaNs and zeroes positive. + * @brief Makes all Nans and zeroes positive. * - * Converts floating point values from @p input using the following rules: + * Converts floating point values from @p in_out using the following rules: * Convert -NaN -> NaN * Convert -0.0 -> 0.0 * - * @throws cudf::logic_error if column does not have floating point data type. - * @param[in] input column_view representing input data - * @param[in] mr device_memory_resource allocator for allocating output data - * - * @returns new column with the modified data + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned column's device memory. */ std::unique_ptr normalize_nans_and_zeros(column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - // output. copies the input - std::unique_ptr out = std::make_unique(input, rmm::cuda_stream_default, mr); - // from device. unique_ptr which gets automatically cleaned up when we leave. - auto out_view = out->mutable_view(); - - detail::normalize_nans_and_zeros(out_view, rmm::cuda_stream_default); - - return out; + return detail::normalize_nans_and_zeros(input, rmm::cuda_stream_default, mr); } /** diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 2145dcc6b91..d12f18f4827 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -422,7 +422,7 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return cudf::empty_like(input); } - if (!input.has_nulls() || !replacement.is_valid()) { + if (!input.has_nulls() || !replacement.is_valid(stream)) { return std::make_unique(input, stream, mr); } diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index 4cc8a84c868..e97c9f8109c 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -143,7 +143,7 @@ struct interleave_columns_impl()) { - return cudf::make_empty_column(data_type{type_to_id()}); + return cudf::make_empty_column(type_to_id()); } if constexpr (op == aggregation::COLLECT_LIST) { return cudf::make_lists_column( - 0, make_empty_column(data_type{type_to_id()}), empty_like(input), 0, {}); + 0, make_empty_column(type_to_id()), empty_like(input), 0, {}); } return empty_like(input); diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp index f982e7b99f2..5b7abdfcaf0 100644 --- a/cpp/src/scalar/scalar.cpp +++ b/cpp/src/scalar/scalar.cpp @@ -14,10 +14,9 @@ * limitations under the License. 
*/ -#include - #include #include +#include #include #include #include diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp index 25418cf0f7e..d2876435780 100644 --- a/cpp/src/scalar/scalar_factories.cpp +++ b/cpp/src/scalar/scalar_factories.cpp @@ -121,12 +121,24 @@ std::unique_ptr make_struct_scalar(host_span data, namespace { struct default_scalar_functor { - template + data_type type; + + template ()>* = nullptr> std::unique_ptr operator()(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { return make_fixed_width_scalar(data_type(type_to_id()), stream, mr); } + + template ()>* = nullptr> + std::unique_ptr operator()(rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto const scale_ = numeric::scale_type{type.scale()}; + auto s = make_fixed_point_scalar(0, scale_, stream, mr); + s->set_valid_async(false, stream); + return s; + } }; template <> @@ -163,7 +175,7 @@ std::unique_ptr make_default_constructed_scalar(data_type type, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return type_dispatcher(type, default_scalar_functor{}, stream, mr); + return type_dispatcher(type, default_scalar_functor{type}, stream, mr); } std::unique_ptr make_empty_scalar_like(column_view const& column, diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index d2c459f0ed8..462d0678eab 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -26,7 +27,6 @@ #include #include #include -#include #include @@ -112,13 +112,13 @@ std::unique_ptr search_ordered(table_view const& t, auto const values_flattened = structs::detail::flatten_nested_columns(matched.second.back(), {}, {}, flatten_nullability); - auto const t_d = table_device_view::create(std::get<0>(t_flattened), stream); - auto const values_d = table_device_view::create(std::get<0>(values_flattened), stream); + auto const t_d = table_device_view::create(t_flattened, stream); + auto const values_d = table_device_view::create(values_flattened, stream); auto const& lhs = find_first ? *t_d : *values_d; auto const& rhs = find_first ? 
*values_d : *t_d; - auto const& column_order_flattened = std::get<1>(t_flattened); - auto const& null_precedence_flattened = std::get<2>(t_flattened); + auto const& column_order_flattened = t_flattened.orders(); + auto const& null_precedence_flattened = t_flattened.null_orders(); auto const column_order_dv = detail::make_device_uvector_async(column_order_flattened, stream); auto const null_precedence_dv = detail::make_device_uvector_async(null_precedence_flattened, stream); @@ -154,14 +154,14 @@ struct contains_scalar_dispatch { auto found_iter = thrust::find(rmm::exec_policy(stream), d_col->pair_begin(), d_col->pair_end(), - thrust::make_pair(s->value(), true)); + thrust::make_pair(s->value(stream), true)); return found_iter != d_col->pair_end(); } else { auto found_iter = thrust::find(rmm::exec_policy(stream), // d_col->begin(), d_col->end(), - s->value()); + s->value(stream)); return found_iter != d_col->end(); } @@ -208,7 +208,7 @@ bool contains(column_view const& col, scalar const& value, rmm::cuda_stream_view { if (col.is_empty()) { return false; } - if (not value.is_valid()) { return col.has_nulls(); } + if (not value.is_valid(stream)) { return col.has_nulls(); } return cudf::type_dispatcher(col.type(), contains_scalar_dispatch{}, col, value, stream); } diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu index c9ab791395d..b08baaa0261 100644 --- a/cpp/src/sort/is_sorted.cu +++ b/cpp/src/sort/is_sorted.cu @@ -15,13 +15,13 @@ */ #include +#include #include #include #include #include #include #include -#include #include #include @@ -39,10 +39,10 @@ auto is_sorted(cudf::table_view const& in, // 0-table_view, 1-column_order, 2-null_precedence, 3-validity_columns auto flattened = structs::detail::flatten_nested_columns(in, column_order, null_precedence); - auto const d_input = table_device_view::create(std::get<0>(flattened), stream); - auto const d_column_order = make_device_uvector_async(std::get<1>(flattened), stream); + auto const d_input = table_device_view::create(flattened, stream); + auto const d_column_order = make_device_uvector_async(flattened.orders(), stream); auto const d_null_precedence = has_nulls - ? make_device_uvector_async(std::get<2>(flattened), stream) + ? make_device_uvector_async(flattened.null_orders(), stream) : rmm::device_uvector(0, stream); auto comparator = row_lexicographic_comparator( diff --git a/cpp/src/sort/segmented_sort.cu b/cpp/src/sort/segmented_sort.cu index 9c4a2786612..8947da7e1bb 100644 --- a/cpp/src/sort/segmented_sort.cu +++ b/cpp/src/sort/segmented_sort.cu @@ -14,36 +14,26 @@ * limitations under the License. */ -#include -#include -#include -#include #include -#include #include #include -#include -#include -#include -#include -#include #include #include #include #include #include -#include - -#include -#include -#include -#include namespace cudf { namespace detail { +namespace { +/** + * @brief The enum specifying which sorting method to use (stable or unstable). + */ +enum class sort_method { STABLE, UNSTABLE }; + // returns segment indices for each element for all segments. // first segment begin index = 0, last segment end index = num_rows. 
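For reference, the per-row segment label that the helper below produces can be derived from segment offsets with a vectorized binary search. This standalone sketch shows one common way to do it; it is illustrative and not necessarily the exact implementation of `get_segment_indices`.

#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <vector>

int main()
{
  std::vector<int> const h_offsets{0, 3, 3, 7};  // 3 segments over 7 rows; the middle one is empty
  thrust::device_vector<int> offsets(h_offsets);
  int const num_rows = 7;
  thrust::device_vector<int> segment_ids(num_rows);

  // Row i belongs to the segment whose begin offset is the last one <= i,
  // i.e. the count of interior offsets that are <= i.
  thrust::upper_bound(thrust::device,
                      offsets.begin() + 1,
                      offsets.end(),
                      thrust::counting_iterator<int>(0),
                      thrust::counting_iterator<int>(num_rows),
                      segment_ids.begin());

  // segment_ids is now {0, 0, 0, 2, 2, 2, 2}: rows after the empty segment land in segment 2.
  return 0;
}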
rmm::device_uvector get_segment_indices(size_type num_rows, @@ -65,12 +55,14 @@ rmm::device_uvector get_segment_indices(size_type num_rows, return segment_ids; } -std::unique_ptr segmented_sorted_order(table_view const& keys, - column_view const& segment_offsets, - std::vector const& column_order, - std::vector const& null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr segmented_sorted_order_common( + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + sort_method sorting, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(segment_offsets.type() == data_type(type_to_id()), "segment offsets should be size_type"); @@ -95,26 +87,39 @@ std::unique_ptr segmented_sorted_order(table_view const& keys, }; auto child_column_order = prepend_default(column_order, order::ASCENDING); auto child_null_precedence = prepend_default(null_precedence, null_order::AFTER); + // return sorted order of child columns - return detail::sorted_order(segid_keys, child_column_order, child_null_precedence, stream, mr); + return sorting == sort_method::STABLE + ? detail::stable_sorted_order( + segid_keys, child_column_order, child_null_precedence, stream, mr) + : detail::sorted_order( + segid_keys, child_column_order, child_null_precedence, stream, mr); } -std::unique_ptr
segmented_sort_by_key(table_view const& values, - table_view const& keys, - column_view const& segment_offsets, - std::vector<order> const& column_order, - std::vector<null_order> const& null_precedence, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr<table>
segmented_sort_by_key_common(table_view const& values, + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + sort_method sorting, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(values.num_rows() == keys.num_rows(), "Mismatch in number of rows for values and keys"); - auto sorted_order = segmented_sorted_order(keys, - segment_offsets, - column_order, - null_precedence, - stream, - rmm::mr::get_current_device_resource()); + auto sorted_order = sorting == sort_method::STABLE + ? stable_segmented_sorted_order(keys, + segment_offsets, + column_order, + null_precedence, + stream, + rmm::mr::get_current_device_resource()) + : segmented_sorted_order(keys, + segment_offsets, + column_order, + null_precedence, + stream, + rmm::mr::get_current_device_resource()); // Gather segmented sort of child value columns` return detail::gather(values, @@ -124,8 +129,87 @@ std::unique_ptr
segmented_sort_by_key(table_view const& values, stream, mr); } + +} // namespace + +std::unique_ptr segmented_sorted_order(table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return segmented_sorted_order_common( + keys, segment_offsets, column_order, null_precedence, sort_method::UNSTABLE, stream, mr); +} + +std::unique_ptr stable_segmented_sorted_order( + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return segmented_sorted_order_common( + keys, segment_offsets, column_order, null_precedence, sort_method::STABLE, stream, mr); +} + +std::unique_ptr
segmented_sort_by_key(table_view const& values, + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return segmented_sort_by_key_common(values, + keys, + segment_offsets, + column_order, + null_precedence, + sort_method::UNSTABLE, + stream, + mr); +} + +std::unique_ptr
stable_segmented_sort_by_key(table_view const& values, + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return segmented_sort_by_key_common( + values, keys, segment_offsets, column_order, null_precedence, sort_method::STABLE, stream, mr); +} + } // namespace detail +std::unique_ptr segmented_sorted_order(table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::segmented_sorted_order( + keys, segment_offsets, column_order, null_precedence, rmm::cuda_stream_default, mr); +} + +std::unique_ptr stable_segmented_sorted_order( + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::stable_segmented_sorted_order( + keys, segment_offsets, column_order, null_precedence, rmm::cuda_stream_default, mr); +} + std::unique_ptr
segmented_sort_by_key(table_view const& values, table_view const& keys, column_view const& segment_offsets, @@ -138,4 +222,16 @@ std::unique_ptr
segmented_sort_by_key(table_view const& values, values, keys, segment_offsets, column_order, null_precedence, rmm::cuda_stream_default, mr); } +std::unique_ptr
stable_segmented_sort_by_key(table_view const& values, + table_view const& keys, + column_view const& segment_offsets, + std::vector const& column_order, + std::vector const& null_precedence, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::stable_segmented_sort_by_key( + values, keys, segment_offsets, column_order, null_precedence, rmm::cuda_stream_default, mr); +} + } // namespace cudf diff --git a/cpp/src/sort/sort_impl.cuh b/cpp/src/sort/sort_impl.cuh index d707ece5ba9..25f0815e645 100644 --- a/cpp/src/sort/sort_impl.cuh +++ b/cpp/src/sort/sort_impl.cuh @@ -18,14 +18,13 @@ #include #include +#include #include #include #include #include #include -#include - #include #include #include @@ -124,13 +123,12 @@ std::unique_ptr sorted_order(table_view input, mutable_indices_view.end(), 0); - auto flattened = structs::detail::flatten_nested_columns(input, column_order, null_precedence); - auto& input_flattened = std::get<0>(flattened); - auto device_table = table_device_view::create(input_flattened, stream); - auto const d_column_order = make_device_uvector_async(std::get<1>(flattened), stream); + auto flattened = structs::detail::flatten_nested_columns(input, column_order, null_precedence); + auto device_table = table_device_view::create(flattened, stream); + auto const d_column_order = make_device_uvector_async(flattened.orders(), stream); - if (has_nulls(input_flattened)) { - auto const d_null_precedence = make_device_uvector_async(std::get<2>(flattened), stream); + if (has_nulls(flattened)) { + auto const d_null_precedence = make_device_uvector_async(flattened.null_orders(), stream); auto const comparator = row_lexicographic_comparator( *device_table, *device_table, d_column_order.data(), d_null_precedence.data()); if (stable) { diff --git a/cpp/src/stream_compaction/drop_duplicates.cu b/cpp/src/stream_compaction/drop_duplicates.cu index 30ea32fba8e..f236e6a5f53 100644 --- a/cpp/src/stream_compaction/drop_duplicates.cu +++ b/cpp/src/stream_compaction/drop_duplicates.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include + #include #include #include @@ -40,74 +42,6 @@ namespace cudf { namespace detail { namespace { - -template -struct unique_copy_fn { - /** - * @brief Functor for unique_copy() - * - * The logic here is equivalent to: - * @code - * ((keep == duplicate_keep_option::KEEP_LAST) || - * (i == 0 || !comp(iter[i], iter[i - 1]))) && - * ((keep == duplicate_keep_option::KEEP_FIRST) || - * (i == last_index || !comp(iter[i], iter[i + 1]))) - * @endcode - * - * It is written this way so that the `comp` comparator - * function appears only once minimizing the inlining - * required and reducing the compile time. - */ - __device__ bool operator()(size_type i) - { - size_type boundary = 0; - size_type offset = 1; - auto keep_option = duplicate_keep_option::KEEP_LAST; - do { - if ((keep != keep_option) && (i != boundary) && comp(iter[i], iter[i - offset])) { - return false; - } - keep_option = duplicate_keep_option::KEEP_FIRST; - boundary = last_index; - offset = -offset; - } while (offset < 0); - return true; - } - - InputIterator iter; - duplicate_keep_option const keep; - BinaryPredicate comp; - size_type const last_index; -}; - -} // namespace - -/** - * @brief Copies unique elements from the range [first, last) to output iterator `output`. - * - * In a consecutive group of duplicate elements, depending on parameter `keep`, - * only the first element is copied, or the last element is copied or neither is copied. 
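// A small host-side sketch of the keep rule documented in the block above, assuming a plain
// std::vector input and a local keep_option enum that mirrors duplicate_keep_option.
// For the input {1,1,2,3,3,3} it keeps indices {0,2,3} for KEEP_FIRST, {1,2,5} for
// KEEP_LAST and only {2} for KEEP_NONE.
#include <cstddef>
#include <vector>

enum class keep_option { KEEP_FIRST, KEEP_LAST, KEEP_NONE };

bool keep_row_sketch(std::vector<int> const& v, std::size_t i, keep_option keep)
{
  bool const first_of_run = (i == 0) || (v[i] != v[i - 1]);
  bool const last_of_run  = (i + 1 == v.size()) || (v[i] != v[i + 1]);
  // same shape as the predicate in the @code block, with comp taken to be equality
  return ((keep == keep_option::KEEP_LAST) || first_of_run) &&
         ((keep == keep_option::KEEP_FIRST) || last_of_run);
}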
- * - * @return End of the range to which the elements are copied. - */ -template -OutputIterator unique_copy(InputIterator first, - InputIterator last, - OutputIterator output, - BinaryPredicate comp, - duplicate_keep_option const keep, - rmm::cuda_stream_view stream) -{ - size_type const last_index = thrust::distance(first, last) - 1; - return thrust::copy_if( - rmm::exec_policy(stream), - first, - last, - thrust::counting_iterator(0), - output, - unique_copy_fn{first, keep, comp, last_index}); -} - /** * @brief Create a column_view of index values which represent the row values * without duplicates as per @p `keep` @@ -137,8 +71,9 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, null_order null_precedence, rmm::cuda_stream_view stream) { - // sort only indices - auto sorted_indices = sorted_order( + // Sort only the indices. + // Note that stable sort must be used to maintain the order of duplicate elements. + auto sorted_indices = stable_sorted_order( keys, std::vector{}, std::vector{static_cast(keys.num_columns()), null_precedence}, @@ -178,6 +113,7 @@ column_view get_unique_ordered_indices(cudf::table_view const& keys, thrust::distance(unique_indices.begin(), result_end)); } } +} // namespace std::unique_ptr
drop_duplicates(table_view const& input, std::vector const& keys, diff --git a/cpp/src/stream_compaction/drop_duplicates.cuh b/cpp/src/stream_compaction/drop_duplicates.cuh new file mode 100644 index 00000000000..3f8ae9507c2 --- /dev/null +++ b/cpp/src/stream_compaction/drop_duplicates.cuh @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace detail { +template +struct unique_copy_fn { + /** + * @brief Functor for unique_copy() + * + * The logic here is equivalent to: + * @code + * ((keep == duplicate_keep_option::KEEP_LAST) || + * (i == 0 || !comp(iter[i], iter[i - 1]))) && + * ((keep == duplicate_keep_option::KEEP_FIRST) || + * (i == last_index || !comp(iter[i], iter[i + 1]))) + * @endcode + * + * It is written this way so that the `comp` comparator + * function appears only once minimizing the inlining + * required and reducing the compile time. + */ + __device__ bool operator()(size_type i) + { + size_type boundary = 0; + size_type offset = 1; + auto keep_option = duplicate_keep_option::KEEP_LAST; + do { + if ((keep != keep_option) && (i != boundary) && comp(iter[i], iter[i - offset])) { + return false; + } + keep_option = duplicate_keep_option::KEEP_FIRST; + boundary = last_index; + offset = -offset; + } while (offset < 0); + return true; + } + + InputIterator iter; + duplicate_keep_option const keep; + BinaryPredicate comp; + size_type const last_index; +}; + +/** + * @brief Copies unique elements from the range [first, last) to output iterator `output`. + * + * In a consecutive group of duplicate elements, depending on parameter `keep`, + * only the first element is copied, or the last element is copied or neither is copied. + * + * @return End of the range to which the elements are copied. + */ +template +OutputIterator unique_copy(InputIterator first, + InputIterator last, + OutputIterator output, + BinaryPredicate comp, + duplicate_keep_option const keep, + rmm::cuda_stream_view stream) +{ + size_type const last_index = thrust::distance(first, last) - 1; + return thrust::copy_if( + rmm::exec_policy(stream), + first, + last, + thrust::counting_iterator(0), + output, + unique_copy_fn{first, keep, comp, last_index}); +} + +} // namespace detail +} // namespace cudf diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu index 023e82dfe24..9618f325fce 100644 --- a/cpp/src/strings/capitalize.cu +++ b/cpp/src/strings/capitalize.cu @@ -35,6 +35,18 @@ namespace strings { namespace detail { namespace { +using char_info = thrust::pair; + +/** + * @brief Returns the given character's info flags. + */ +__device__ char_info get_char_info(character_flags_table_type const* d_flags, char_utf8 chr) +{ + auto const code_point = detail::utf8_to_codepoint(chr); + auto const flag = code_point <= 0x00FFFF ? 
d_flags[code_point] : character_flags_table_type{0}; + return char_info{code_point, flag}; +} + /** * @brief Base class for capitalize and title functors. * @@ -60,15 +72,6 @@ struct base_fn { { } - using char_info = thrust::pair; - - __device__ char_info get_char_info(char_utf8 chr) const - { - auto const code_point = detail::utf8_to_codepoint(chr); - auto const flag = code_point <= 0x00FFFF ? d_flags[code_point] : character_flags_table_type{0}; - return char_info{code_point, flag}; - } - __device__ int32_t convert_char(char_info const& info, char* d_buffer) const { auto const code_point = info.first; @@ -111,7 +114,7 @@ struct base_fn { auto d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; bool capitalize = true; for (auto const chr : d_str) { - auto const info = get_char_info(chr); + auto const info = get_char_info(d_flags, chr); auto const flag = info.second; auto const change_case = capitalize ? IS_LOWER(flag) : IS_UPPER(flag); @@ -178,6 +181,36 @@ struct title_fn : base_fn { }; }; +/** + * @brief Functor for determining title format for each string in a column. + * + * The first letter of each word should be upper-case (IS_UPPER). + * All other characters should be lower-case (IS_LOWER). + * Non-upper/lower-case (IS_UPPER_OR_LOWER) characters delimit words. + */ +struct is_title_fn { + character_flags_table_type const* d_flags; + column_device_view const d_column; + + __device__ bool operator()(size_type idx) + { + if (d_column.is_null(idx)) { return false; } + auto const d_str = d_column.element(idx); + + bool at_least_one_valid = false; // requires one or more cased characters + bool should_be_capitalized = true; // current character should be upper-case + for (auto const chr : d_str) { + auto const flag = get_char_info(d_flags, chr).second; + if (IS_UPPER_OR_LOWER(flag)) { + if (should_be_capitalized == !IS_UPPER(flag)) return false; + at_least_one_valid = true; + } + should_be_capitalized = !IS_UPPER_OR_LOWER(flag); + } + return at_least_one_valid; + } +}; + /** * @brief Common utility function for title() and capitalize(). 
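// A host-side sketch of the title-case rules implemented by is_title_fn above, assuming
// ASCII input only (the device code consults the character-flags table instead of <cctype>).
// Expected results: "Hello World" and "+Abc+Def" are title-cased; "Hello world", "HELLO"
// and "1234" are not.
#include <cctype>
#include <string>

bool is_title_sketch(std::string const& s)
{
  bool at_least_one_cased = false;
  bool expect_upper       = true;  // the first cased character of each word must be upper-case
  for (unsigned char c : s) {
    if (std::isalpha(c)) {
      if (expect_upper != static_cast<bool>(std::isupper(c))) { return false; }
      at_least_one_cased = true;
      expect_upper       = false;
    } else {
      expect_upper = true;  // a non-cased character starts a new word
    }
  }
  return at_least_one_cased;
}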
* @@ -210,7 +243,7 @@ std::unique_ptr capitalize(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiters.is_valid(stream), "Delimiter must be a valid string"); - if (input.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (input.is_empty()) return make_empty_column(type_id::STRING); auto const d_column = column_device_view::create(input.parent(), stream); auto const d_delimiters = delimiters.value(stream); return capitalizer(capitalize_fn{*d_column, d_delimiters}, input, stream, mr); @@ -221,11 +254,31 @@ std::unique_ptr title(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (input.is_empty()) return make_empty_column(type_id::STRING); auto d_column = column_device_view::create(input.parent(), stream); return capitalizer(title_fn{*d_column, sequence_type}, input, stream, mr); } +std::unique_ptr is_title(strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) return make_empty_column(type_id::BOOL8); + auto results = make_numeric_column(data_type{type_id::BOOL8}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + auto d_column = column_device_view::create(input.parent(), stream); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + results->mutable_view().data(), + is_title_fn{get_character_flags_table(), *d_column}); + return results; +} + } // namespace detail std::unique_ptr capitalize(strings_column_view const& input, @@ -244,5 +297,12 @@ std::unique_ptr title(strings_column_view const& input, return detail::title(input, sequence_type, rmm::cuda_stream_default, mr); } +std::unique_ptr is_title(strings_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::is_title(input, rmm::cuda_stream_default, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index de7b6b2b560..2b4d832e85e 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -125,7 +125,7 @@ std::unique_ptr convert_case(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (strings.is_empty()) return make_empty_column(type_id::STRING); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index f61e1c1241b..3d87197873f 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -152,7 +152,7 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(replacement.is_valid(), "Parameter replacement must be valid"); + CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid"); if (types_to_remove == ALL_TYPES) CUDF_EXPECTS(types_to_keep != ALL_TYPES, "Parameters types_to_remove and types_to_keep must not be both ALL_TYPES"); diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 58a41277794..c4211fcf9fd 
100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -129,9 +129,9 @@ std::unique_ptr concatenate(table_view const& strings_columns, "All columns must be of type string"); auto const strings_count = strings_columns.num_rows(); if (strings_count == 0) // empty begets empty - return make_empty_column(data_type{type_id::STRING}); + return make_empty_column(type_id::STRING); - CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be a valid string_scalar"); + CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be a valid string_scalar"); string_view d_separator(separator.data(), separator.size()); auto d_narep = get_scalar_device_view(const_cast(narep)); @@ -219,7 +219,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, CUDF_EXPECTS(strings_count == separators.size(), "Separators column should be the same size as the strings columns"); if (strings_count == 0) // Empty begets empty - return make_empty_column(data_type{type_id::STRING}); + return make_empty_column(type_id::STRING); // Invalid output column strings - null rows string_view const invalid_str{nullptr, 0}; diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index f7f3e63d213..c8d3e728805 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -43,9 +43,9 @@ std::unique_ptr join_strings(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); - CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be a valid string_scalar"); + CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be a valid string_scalar"); string_view d_separator(separator.data(), separator.size()); auto d_narep = get_scalar_device_view(const_cast(narep)); @@ -93,7 +93,7 @@ std::unique_ptr join_strings(strings_column_view const& strings, // build null mask // only one entry so it is either all valid or all null auto const null_count = - static_cast(strings.null_count() == strings_count && !narep.is_valid()); + static_cast(strings.null_count() == strings_count && !narep.is_valid(stream)); auto null_mask = null_count ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr) : rmm::device_buffer{0, stream, mr}; diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index 3962dfcea57..9482d4db9b8 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -175,10 +175,10 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string { CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, "The input column must be a column of lists of strings"); - CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be a valid string_scalar"); + CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be a valid string_scalar"); auto const num_rows = lists_strings_column.size(); - if (num_rows == 0) { return make_empty_column(data_type{type_id::STRING}); } + if (num_rows == 0) { return make_empty_column(type_id::STRING); } // Accessing the child strings column of the lists column must be done by calling `child()` on the // lists column, not `get_sliced_child()`. 
This is because calling to `offsets_begin()` on the @@ -252,7 +252,7 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string "Separators column should be the same size as the lists columns"); auto const num_rows = lists_strings_column.size(); - if (num_rows == 0) { return make_empty_column(data_type{type_id::STRING}); } + if (num_rows == 0) { return make_empty_column(type_id::STRING); } // Accessing the child strings column of the lists column must be done by calling `child()` on the // lists column, not `get_sliced_child()`. This is because calling to `offsets_begin()` on the diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index 628dbcb8755..9376a0082a8 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -66,6 +66,7 @@ struct contains_fn { std::unique_ptr contains_util( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, bool beginning_only = false, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) @@ -75,7 +76,8 @@ std::unique_ptr contains_util( auto d_column = *strings_column; // compile regex into device object - auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto prog = + reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); auto d_prog = *prog; // create the output column @@ -123,19 +125,21 @@ std::unique_ptr contains_util( std::unique_ptr contains_re( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return contains_util(strings, pattern, false, stream, mr); + return contains_util(strings, pattern, flags, false, stream, mr); } std::unique_ptr matches_re( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - return contains_util(strings, pattern, true, stream, mr); + return contains_util(strings, pattern, flags, true, stream, mr); } } // namespace detail @@ -144,18 +148,20 @@ std::unique_ptr matches_re( std::unique_ptr contains_re(strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::contains_re(strings, pattern, rmm::cuda_stream_default, mr); + return detail::contains_re(strings, pattern, flags, rmm::cuda_stream_default, mr); } std::unique_ptr matches_re(strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::matches_re(strings, pattern, rmm::cuda_stream_default, mr); + return detail::matches_re(strings, pattern, flags, rmm::cuda_stream_default, mr); } namespace detail { @@ -190,6 +196,7 @@ struct count_fn { std::unique_ptr count_re( strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -198,7 +205,8 @@ std::unique_ptr count_re( auto d_column = *strings_column; // compile regex into device object - auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto prog = + reprog_device::create(pattern, flags, 
get_character_flags_table(), strings_count, stream); auto d_prog = *prog; // create the output column @@ -247,10 +255,11 @@ std::unique_ptr count_re( std::unique_ptr count_re(strings_column_view const& strings, std::string const& pattern, + regex_flags const flags, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_re(strings, pattern, rmm::cuda_stream_default, mr); + return detail::count_re(strings, pattern, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index acfce097eed..0691adc9eb7 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -46,7 +46,7 @@ std::unique_ptr to_booleans(strings_column_view const& strings, size_type strings_count = strings.size(); if (strings_count == 0) return make_numeric_column(data_type{type_id::BOOL8}, 0); - CUDF_EXPECTS(true_string.is_valid() && true_string.size() > 0, + CUDF_EXPECTS(true_string.is_valid(stream) && true_string.size() > 0, "Parameter true_string must not be empty."); auto d_true = string_view(true_string.data(), true_string.size()); @@ -96,13 +96,13 @@ std::unique_ptr from_booleans(column_view const& booleans, rmm::mr::device_memory_resource* mr) { size_type strings_count = booleans.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); CUDF_EXPECTS(booleans.type().id() == type_id::BOOL8, "Input column must be boolean type"); - CUDF_EXPECTS(true_string.is_valid() && true_string.size() > 0, + CUDF_EXPECTS(true_string.is_valid(stream) && true_string.size() > 0, "Parameter true_string must not be empty."); auto d_true = string_view(true_string.data(), true_string.size()); - CUDF_EXPECTS(false_string.is_valid() && false_string.size() > 0, + CUDF_EXPECTS(false_string.is_valid(stream) && false_string.size() > 0, "Parameter false_string must not be empty."); auto d_false = string_view(false_string.data(), false_string.size()); diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index c0a20e1e47e..51a6a796ba3 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -608,7 +608,7 @@ std::unique_ptr is_timestamp(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { size_type strings_count = input.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::BOOL8}); + if (strings_count == 0) return make_empty_column(type_id::BOOL8); CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); @@ -1096,7 +1096,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (timestamps.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (timestamps.is_empty()) return make_empty_column(type_id::STRING); CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty."); CUDF_EXPECTS(names.is_empty() || names.size() == format_names_size, @@ -1109,7 +1109,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, format_compiler compiler(format, stream, specifier_map{{'w', 1}, {'W', 2}, {'u', 1}, {'U', 2}, {'V', 2}, {'G', 4}, {'a', 3}, {'A', 3}, {'b', 3}, {'B', 3}}); - // clang-format on + // clang-format on auto const d_format_items = compiler.format_items(); auto const d_timestamps = 
column_device_view::create(timestamps, stream); @@ -1126,7 +1126,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, std::move(offsets_column), std::move(chars_column), timestamps.null_count(), - cudf::detail::copy_bitmask(timestamps,stream,mr)); + cudf::detail::copy_bitmask(timestamps, stream, mr)); } } // namespace detail diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index eb733f0a302..f286149ea46 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -701,7 +701,7 @@ std::unique_ptr from_durations(column_view const& durations, rmm::mr::device_memory_resource* mr) { size_type strings_count = durations.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); return type_dispatcher( durations.type(), dispatch_from_durations_fn{}, durations, format, stream, mr); diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index f49a439a257..c29aa6560e9 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -314,7 +314,7 @@ std::unique_ptr from_fixed_point(column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (input.is_empty()) return make_empty_column(type_id::STRING); return type_dispatcher(input.type(), dispatch_from_fixed_point_fn{}, input, stream, mr); } @@ -379,7 +379,7 @@ std::unique_ptr is_fixed_point(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.is_empty()) return cudf::make_empty_column(data_type{type_id::BOOL8}); + if (input.is_empty()) return cudf::make_empty_column(type_id::BOOL8); return type_dispatcher( decimal_type, dispatch_is_fixed_point_fn{}, input, decimal_type, stream, mr); } diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 39215dd2721..366d4fe7d42 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -526,7 +526,7 @@ std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_memory_resource* mr) { size_type strings_count = floats.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); return type_dispatcher(floats.type(), dispatch_from_floats_fn{}, floats, stream, mr); } diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 964fdccc849..e6edcde2d48 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -267,7 +267,7 @@ std::unique_ptr integers_to_hex(column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.is_empty()) { return cudf::make_empty_column(data_type{type_id::STRING}); } + if (input.is_empty()) { return cudf::make_empty_column(type_id::STRING); } return type_dispatcher(input.type(), dispatch_integers_to_hex_fn{}, input, stream, mr); } diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index e22283522e8..072e367e19a 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -169,7 
+169,7 @@ std::unique_ptr is_integer( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - if (strings.is_empty()) { return cudf::make_empty_column(data_type{type_id::BOOL8}); } + if (strings.is_empty()) { return cudf::make_empty_column(type_id::BOOL8); } return type_dispatcher(int_type, dispatch_is_integer_fn{}, strings, stream, mr); } @@ -390,7 +390,7 @@ std::unique_ptr from_integers(column_view const& integers, rmm::mr::device_memory_resource* mr) { size_type strings_count = integers.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); return type_dispatcher(integers.type(), dispatch_from_integers_fn{}, integers, stream, mr); } diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 5b708c52dd2..9006a998b61 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -164,7 +164,7 @@ std::unique_ptr integers_to_ipv4( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = integers.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type"); diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu new file mode 100644 index 00000000000..7f325bf29ed --- /dev/null +++ b/cpp/src/strings/convert/convert_lists.cu @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { +namespace { + +// position of the element separator string (e.g. comma ',') within the separators column +constexpr size_type separator_index = 0; +// position of the enclosure strings (e.g. []) within the separators column +constexpr size_type left_brace_index = 1; +constexpr size_type right_brace_index = 2; + +/** + * @brief Pending separator type for `stack_item` + */ +enum class item_separator : int8_t { NONE, ELEMENT, LIST }; + +/** + * @brief Stack item used to manage nested lists. + * + * Each item includes the current range and the pending separator. + */ +struct alignas(8) stack_item { + size_type left_idx; + size_type right_idx; + item_separator separator{item_separator::NONE}; +}; + +/** + * @brief Formatting lists functor. + * + * This formats the input list column into individual strings using the + * specified separators and null-representation (na_rep) string. + * + * Recursion is simulated by using stack allocating per output string. 
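// A hedged usage sketch for the formatter defined below: given a LIST<STRING> column
// [["a","bb"], null, [], ["c",null]] with an empty separators column (so the fallback
// '[', ']' and ',' characters are used) and na_rep = "NULL", the expected output strings
// are "[a,bb]", "NULL", "[]" and "[c,NULL]". The public header path is an assumption based
// on the library's usual layout and is not shown in this diff.
#include <cudf/column/column.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/convert/convert_lists.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

std::unique_ptr<cudf::column> format_lists_sketch(cudf::lists_column_view const& input,
                                                  cudf::strings_column_view const& separators)
{
  // format each list row into a single string, writing "NULL" for null entries
  return cudf::strings::format_list_column(
    input, cudf::string_scalar("NULL"), separators, rmm::mr::get_current_device_resource());
}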
+ */ +struct format_lists_fn { + column_device_view const d_input; + column_device_view const d_separators; + string_view const d_na_rep; + stack_item* d_stack; + size_type const max_depth; + size_type* d_offsets{}; + char* d_chars{}; + + __device__ column_device_view get_nested_child(size_type idx) + { + auto current = d_input; + while (idx > 0) { + current = current.child(cudf::lists_column_view::child_column_index); + --idx; + } + return current; + } + + __device__ size_type write_separator(char*& d_output, size_type sep_idx = separator_index) + { + auto d_str = [&] { + if (d_separators.size() > sep_idx) return d_separators.element(sep_idx); + if (sep_idx == left_brace_index) return string_view("[", 1); + if (sep_idx == right_brace_index) return string_view("]", 1); + return string_view(",", 1); + }(); + if (d_output) d_output = copy_string(d_output, d_str); + return d_str.size_bytes(); + } + + __device__ size_type write_na_rep(char*& d_output) + { + if (d_output) d_output = copy_string(d_output, d_na_rep); + return d_na_rep.size_bytes(); + } + + __device__ size_type write_strings(column_device_view const& col, + size_type left_idx, + size_type right_idx, + char* d_output) + { + size_type bytes = 0; + for (size_type idx = left_idx; idx < right_idx; ++idx) { + if (col.is_null(idx)) { + bytes += write_na_rep(d_output); // e.g. 'NULL' + } else { + auto d_str = col.element(idx); + if (d_output) d_output = copy_string(d_output, d_str); + bytes += d_str.size_bytes(); + } + if (idx + 1 < right_idx) { + bytes += write_separator(d_output); // e.g. comma ',' + } + } + return bytes; + } + + __device__ void operator()(size_type idx) + { + size_type bytes = 0; + char* d_output = d_chars ? d_chars + d_offsets[idx] : nullptr; + + // push first item to the stack + auto item_stack = d_stack + idx * max_depth; + auto stack_idx = size_type{0}; + item_stack[stack_idx++] = stack_item{idx, idx + 1}; + + // process until stack is empty + while (stack_idx > 0) { + --stack_idx; // pop from stack + auto const item = item_stack[stack_idx]; + auto const view = get_nested_child(stack_idx); + + auto offsets = view.child(cudf::lists_column_view::offsets_column_index); + auto d_offsets = offsets.data() + view.offset(); + + // add pending separator + if (item.separator == item_separator::LIST) { + bytes += write_separator(d_output, right_brace_index); + } else if (item.separator == item_separator::ELEMENT) { + bytes += write_separator(d_output, separator_index); + } + + // loop through the child elements for the current view + for (auto jdx = item.left_idx; jdx < item.right_idx; ++jdx) { + auto const lhs = d_offsets[jdx]; + auto const rhs = d_offsets[jdx + 1]; + + if (view.is_null(jdx)) { + bytes += write_na_rep(d_output); // e.g. 'NULL' + } else if (lhs == rhs) { // e.g. '[]' + bytes += write_separator(d_output, left_brace_index); + bytes += write_separator(d_output, right_brace_index); + } else { + auto child = view.child(cudf::lists_column_view::child_column_index); + bytes += write_separator(d_output, left_brace_index); + + // if child is a list type, then recurse into it + if (child.type().id() == type_id::LIST) { + // push current state to the stack + item_stack[stack_idx++] = + stack_item{jdx + 1, + item.right_idx, + jdx + 1 < item.right_idx ? 
item_separator::ELEMENT : item_separator::LIST}; + // push child to the stack + item_stack[stack_idx++] = stack_item{lhs, rhs}; + break; // back to the stack (while-loop) + } + + // otherwise, the child is a strings column; + // write out the string elements + auto const size = write_strings(child, lhs, rhs, d_output); + bytes += size; + if (d_output) d_output += size; + + bytes += write_separator(d_output, right_brace_index); + } + + // write element separator (e.g. comma ',') if not at the end + if (jdx + 1 < item.right_idx) { bytes += write_separator(d_output); } + } + } + + if (!d_chars) d_offsets[idx] = bytes; + } +}; + +} // namespace + +std::unique_ptr format_list_column(lists_column_view const& input, + string_scalar const& na_rep, + strings_column_view const& separators, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.is_empty()) return make_empty_column(data_type{type_id::STRING}); + + size_type depth = 1; // count the depth to the strings column + auto child_col = input.child(); + while (child_col.type().id() == type_id::LIST) { + child_col = cudf::lists_column_view(child_col).child(); + ++depth; + } + CUDF_EXPECTS(child_col.type().id() == type_id::STRING, "lists child must be a STRING column"); + + CUDF_EXPECTS(separators.size() == 0 || separators.size() == 3, + "Invalid number of separator strings"); + CUDF_EXPECTS(na_rep.is_valid(stream), "Null replacement string must be valid"); + + // create stack memory for processing nested lists + auto stack_buffer = rmm::device_uvector(input.size() * depth, stream); + + auto const d_input = column_device_view::create(input.parent(), stream); + auto const d_separators = column_device_view::create(separators.parent(), stream); + auto const d_na_rep = na_rep.value(stream); + + auto children = cudf::strings::detail::make_strings_children( + format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth}, + input.size(), + stream, + mr); + + return make_strings_column( + input.size(), std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); +} + +} // namespace detail + +// external API + +std::unique_ptr format_list_column(lists_column_view const& input, + string_scalar const& na_rep, + strings_column_view const& separators, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::format_list_column(input, na_rep, separators, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index fb494a9634c..20935febf21 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -132,7 +132,7 @@ std::unique_ptr url_encode( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; @@ -391,7 +391,7 @@ std::unique_ptr url_decode( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); constexpr int num_warps_per_threadblock = 4; constexpr int threadblock_size = 
num_warps_per_threadblock * cudf::detail::warp_size; diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index f131f0e40a3..db8b37a9592 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -214,7 +214,7 @@ std::unique_ptr concatenate(host_span columns, auto const total_bytes = std::get<5>(device_views); auto const offsets_count = strings_count + 1; - if (strings_count == 0) { return make_empty_column(data_type{type_id::STRING}); } + if (strings_count == 0) { return make_empty_column(type_id::STRING); } CUDF_EXPECTS(offsets_count <= static_cast(std::numeric_limits::max()), "total number of strings is too large for cudf column"); diff --git a/cpp/src/strings/copying/copying.cu b/cpp/src/strings/copying/copying.cu index 24572576498..e722ad520b3 100644 --- a/cpp/src/strings/copying/copying.cu +++ b/cpp/src/strings/copying/copying.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include @@ -34,7 +34,7 @@ std::unique_ptr copy_slice(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (strings.is_empty()) return make_empty_column(type_id::STRING); if (end < 0 || end > strings.size()) end = strings.size(); CUDF_EXPECTS(((start >= 0) && (start < end)), "Invalid start parameter value."); auto const strings_count = end - start; @@ -42,7 +42,9 @@ std::unique_ptr copy_slice(strings_column_view const& strings, // slice the offsets child column auto offsets_column = std::make_unique( - cudf::slice(strings.offsets(), {offsets_offset, offsets_offset + strings_count + 1}).front(), + cudf::detail::slice( + strings.offsets(), {offsets_offset, offsets_offset + strings_count + 1}, stream) + .front(), stream, mr); auto const chars_offset = @@ -61,7 +63,9 @@ std::unique_ptr copy_slice(strings_column_view const& strings, auto const data_size = cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = std::make_unique( - cudf::slice(strings.chars(), {chars_offset, chars_offset + data_size}).front(), stream, mr); + cudf::detail::slice(strings.chars(), {chars_offset, chars_offset + data_size}, stream).front(), + stream, + mr); // slice the null mask auto null_mask = cudf::detail::copy_bitmask( diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index b4219585b78..024c8d2924d 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -96,7 +96,9 @@ std::unique_ptr shift(strings_column_view const& input, // output offsets column is the same size as the input auto const input_offsets = - cudf::slice(input.offsets(), {input.offset(), input.offset() + input.size() + 1}).front(); + cudf::detail::slice( + input.offsets(), {input.offset(), input.offset() + input.size() + 1}, stream) + .front(); auto const offsets_size = input_offsets.size(); auto offsets_column = cudf::detail::allocate_like( input_offsets, offsets_size, mask_allocation_policy::NEVER, stream, mr); diff --git a/cpp/src/strings/extract.cu b/cpp/src/strings/extract.cu index d12f5c534a5..c4076dd61c1 100644 --- a/cpp/src/strings/extract.cu +++ b/cpp/src/strings/extract.cu @@ -19,13 +19,13 @@ #include #include -#include +#include #include -#include -#include +#include #include #include #include +#include #include @@ -47,29 +47,36 @@ using string_index_pair = thrust::pair; template struct extract_fn { reprog_device prog; - 
column_device_view d_strings; - size_type column_index; + column_device_view const d_strings; + cudf::detail::device_2dspan d_indices; - __device__ string_index_pair operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_strings.is_null(idx)) return string_index_pair{nullptr, 0}; - string_view d_str = d_strings.element(idx); - string_index_pair result{nullptr, 0}; - int32_t begin = 0; - int32_t end = -1; // handles empty strings automatically - if (prog.find(idx, d_str, begin, end) > 0) { - auto extracted = prog.extract(idx, d_str, begin, end, column_index); - if (extracted) { - auto const offset = d_str.byte_offset(extracted.value().first); - // build index-pair - result = string_index_pair{d_str.data() + offset, - d_str.byte_offset(extracted.value().second) - offset}; + auto const groups = prog.group_counts(); + auto d_output = d_indices[idx]; + + if (d_strings.is_valid(idx)) { + auto const d_str = d_strings.element(idx); + int32_t begin = 0; + int32_t end = -1; // handles empty strings automatically + if (prog.find(idx, d_str, begin, end) > 0) { + for (auto col_idx = 0; col_idx < groups; ++col_idx) { + auto const extracted = prog.extract(idx, d_str, begin, end, col_idx); + d_output[col_idx] = [&] { + if (!extracted) return string_index_pair{nullptr, 0}; + auto const offset = d_str.byte_offset((*extracted).first); + return string_index_pair{d_str.data() + offset, + d_str.byte_offset((*extracted).second) - offset}; + }(); + } + return; } } - return result; + + // if null row or no match found, fill the output with null entries + thrust::fill(thrust::seq, d_output.begin(), d_output.end(), string_index_pair{nullptr, 0}); } }; - } // namespace // @@ -79,9 +86,9 @@ std::unique_ptr
extract( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - auto strings_count = strings.size(); - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_strings = *strings_column; + auto const strings_count = strings.size(); + auto const strings_column = column_device_view::create(strings.parent(), stream); + auto const d_strings = *strings_column; // compile regex into device object auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); @@ -90,41 +97,50 @@ std::unique_ptr
extract( auto const groups = d_prog.group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); + rmm::device_uvector indices(strings_count * groups, stream); + cudf::detail::device_2dspan d_indices(indices.data(), strings_count, groups); + + auto const regex_insts = d_prog.insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + extract_fn{d_prog, d_strings, d_indices}); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + extract_fn{d_prog, d_strings, d_indices}); + } else if (regex_insts <= RX_LARGE_INSTS) { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + extract_fn{d_prog, d_strings, d_indices}); + } else { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + extract_fn{d_prog, d_strings, d_indices}); + } + // build a result column for each group - std::vector> results; - auto regex_insts = d_prog.insts_counts(); - - for (int32_t column_index = 0; column_index < groups; ++column_index) { - rmm::device_uvector indices(strings_count, stream); - - if (regex_insts <= RX_SMALL_INSTS) { - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - extract_fn{d_prog, d_strings, column_index}); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - extract_fn{d_prog, d_strings, column_index}); - } else if (regex_insts <= RX_LARGE_INSTS) { - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - extract_fn{d_prog, d_strings, column_index}); - } else { - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - indices.begin(), - extract_fn{d_prog, d_strings, column_index}); - } + std::vector> results(groups); + auto make_strings_lambda = [&](size_type column_index) { + // this iterator transposes the extract results into column order + auto indices_itr = + thrust::make_permutation_iterator(indices.begin(), + cudf::detail::make_counting_transform_iterator( + 0, [column_index, groups] __device__(size_type idx) { + return (idx * groups) + column_index; + })); + return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr); + }; + + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(groups), + results.begin(), + make_strings_lambda); - results.emplace_back(make_strings_column(indices, stream, mr)); - } return std::make_unique
(std::move(results)); } diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index 56ff0ecffd0..eff010775dc 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -40,7 +40,7 @@ std::unique_ptr fill( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); CUDF_EXPECTS((begin >= 0) && (end <= strings_count), "Parameters [begin,end) are outside the range of the provided strings column"); CUDF_EXPECTS(begin <= end, "Parameters [begin,end) have invalid range values"); diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 5408cc43d1f..883a7fada75 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -115,8 +115,8 @@ std::unique_ptr filter_characters( rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); - CUDF_EXPECTS(replacement.is_valid(), "Parameter replacement must be valid"); + if (strings_count == 0) return make_empty_column(type_id::STRING); + CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid"); cudf::string_view d_replacement(replacement.data(), replacement.size()); // convert input table for copy to device memory diff --git a/cpp/src/strings/find.cu b/cpp/src/strings/find.cu index 8e40b7ce7e8..45b23d848c0 100644 --- a/cpp/src/strings/find.cu +++ b/cpp/src/strings/find.cu @@ -60,7 +60,7 @@ std::unique_ptr find_fn(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid."); + CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid."); CUDF_EXPECTS(start >= 0, "Parameter start must be positive integer or zero."); if ((stop > 0) && (start > stop)) CUDF_FAIL("Parameter start must be less than stop."); // @@ -184,9 +184,9 @@ std::unique_ptr contains_fn(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::BOOL8}); + if (strings_count == 0) return make_empty_column(type_id::BOOL8); - CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid."); + CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid."); if (target.size() == 0) // empty target string returns true { auto const true_scalar = make_fixed_width_scalar(true, stream); @@ -244,7 +244,7 @@ std::unique_ptr contains_fn(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(data_type{type_id::BOOL8}); + if (strings.is_empty()) return make_empty_column(type_id::BOOL8); CUDF_EXPECTS(targets.size() == strings.size(), "strings and targets column must be the same size"); diff --git a/cpp/src/strings/find_multiple.cu b/cpp/src/strings/find_multiple.cu index 28a733bd204..72e7081cb7a 100644 --- a/cpp/src/strings/find_multiple.cu +++ b/cpp/src/strings/find_multiple.cu @@ -37,7 +37,7 @@ std::unique_ptr find_multiple( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::INT32}); + if 
(strings_count == 0) return make_empty_column(type_id::INT32); auto targets_count = targets.size(); CUDF_EXPECTS(targets_count > 0, "Must include at least one search target"); CUDF_EXPECTS(!targets.has_nulls(), "Search targets cannot contain null strings"); diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 9baa84e00be..20868077cf4 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -61,7 +61,7 @@ std::unique_ptr pad( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); CUDF_EXPECTS(!fill_char.empty(), "fill_char parameter must not be empty"); char_utf8 d_fill_char = 0; size_type fill_char_size = to_char_utf8(fill_char.c_str(), d_fill_char); @@ -153,7 +153,7 @@ std::unique_ptr zfill( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 0e00221dabf..0e3dcb93826 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include + #include #include @@ -523,6 +524,8 @@ class regex_compiler { bool lastwasand; int nbra; + regex_flags flags; + inline void pushand(int f, int l) { andstack.push_back({f, l}); } inline Node popand(int op) @@ -567,11 +570,11 @@ class regex_compiler { case LBRA: /* must have been RBRA */ op1 = popand('('); id_inst2 = m_prog.add_inst(RBRA); - m_prog.inst_at(id_inst2).u1.subid = ator.subid; // subidstack[subidstack.size()-1]; + m_prog.inst_at(id_inst2).u1.subid = ator.subid; m_prog.inst_at(op1.id_last).u2.next_id = id_inst2; id_inst1 = m_prog.add_inst(LBRA); - m_prog.inst_at(id_inst1).u1.subid = ator.subid; // subidstack[subidstack.size() - 1]; - m_prog.inst_at(id_inst1).u2.next_id = op1.id_first; + m_prog.inst_at(id_inst1).u1.subid = ator.subid; + m_prog.inst_at(id_inst1).u2.next_id = op1.id_first; pushand(id_inst1, id_inst2); return; case OR: @@ -664,10 +667,13 @@ class regex_compiler { { if (lastwasand) Operator(CAT); /* catenate is implicit */ int inst_id = m_prog.add_inst(t); - if (t == CCLASS || t == NCCLASS) + if (t == CCLASS || t == NCCLASS) { m_prog.inst_at(inst_id).u1.cls_id = yyclass_id; - else if (t == CHAR || t == BOL || t == EOL) + } else if (t == CHAR) { m_prog.inst_at(inst_id).u1.c = yy; + } else if (t == BOL || t == EOL) { + m_prog.inst_at(inst_id).u1.c = is_multiline(flags) ? 
yy : '\n'; + } pushand(inst_id, inst_id); lastwasand = true; } @@ -766,13 +772,20 @@ class regex_compiler { } public: - regex_compiler(const char32_t* pattern, int dot_type, reprog& prog) - : m_prog(prog), cursubid(0), pushsubid(0), lastwasand(false), nbra(0), yy(0), yyclass_id(0) + regex_compiler(const char32_t* pattern, regex_flags const flags, reprog& prog) + : m_prog(prog), + cursubid(0), + pushsubid(0), + lastwasand(false), + nbra(0), + flags(flags), + yy(0), + yyclass_id(0) { // Parse std::vector items; { - regex_parser parser(pattern, dot_type, m_prog); + regex_parser parser(pattern, is_dotall(flags) ? ANYNL : ANY, m_prog); // Expand counted repetitions if (parser.m_has_counted) @@ -822,11 +835,12 @@ class regex_compiler { }; // Convert pattern into program -reprog reprog::create_from(const char32_t* pattern) +reprog reprog::create_from(const char32_t* pattern, regex_flags const flags) { reprog rtn; - regex_compiler compiler(pattern, ANY, rtn); // future feature: ANYNL - // rtn->print(); + regex_compiler compiler(pattern, flags, rtn); + // for debugging, it can be helpful to call rtn.print(flags) here to dump + // out the instructions that have been created from the given pattern return rtn; } @@ -912,8 +926,10 @@ void reprog::optimize2() _startinst_ids.push_back(-1); // terminator mark } -void reprog::print() +#ifndef NDBUG +void reprog::print(regex_flags const flags) { + printf("Flags = 0x%08x\n", static_cast(flags)); printf("Instructions:\n"); for (std::size_t i = 0; i < _insts.size(); i++) { const reinst& inst = _insts[i]; @@ -941,8 +957,26 @@ void reprog::print() case ANY: printf("ANY, nextid= %d", inst.u2.next_id); break; case ANYNL: printf("ANYNL, nextid= %d", inst.u2.next_id); break; case NOP: printf("NOP, nextid= %d", inst.u2.next_id); break; - case BOL: printf("BOL, c = '%c', nextid= %d", inst.u1.c, inst.u2.next_id); break; - case EOL: printf("EOL, c = '%c', nextid= %d", inst.u1.c, inst.u2.next_id); break; + case BOL: { + printf("BOL, c = "); + if (inst.u1.c == '\n') { + printf("'\\n'"); + } else { + printf("'%c'", inst.u1.c); + } + printf(", nextid= %d", inst.u2.next_id); + break; + } + case EOL: { + printf("EOL, c = "); + if (inst.u1.c == '\n') { + printf("'\\n'"); + } else { + printf("'%c'", inst.u1.c); + } + printf(", nextid= %d", inst.u2.next_id); + break; + } case CCLASS: printf("CCLASS, cls_id=%d , nextid= %d", inst.u1.cls_id, inst.u2.next_id); break; case NCCLASS: printf("NCCLASS, cls_id=%d , nextid= %d", inst.u1.cls_id, inst.u2.next_id); @@ -992,6 +1026,7 @@ void reprog::print() } if (_num_capturing_groups) printf("Number of capturing groups: %d\n", _num_capturing_groups); } +#endif } // namespace detail } // namespace strings diff --git a/cpp/src/strings/regex/regcomp.h b/cpp/src/strings/regex/regcomp.h index 90bbc90f622..63d7933eebe 100644 --- a/cpp/src/strings/regex/regcomp.h +++ b/cpp/src/strings/regex/regcomp.h @@ -14,6 +14,9 @@ * limitations under the License. */ #pragma once + +#include + #include #include @@ -89,7 +92,7 @@ class reprog { * @brief Parses the given regex pattern and compiles * into a list of chained instructions. 
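// A hedged usage sketch for the new regex_flags parameter plumbed through create_from():
// DOTALL lets '.' match the newline character and MULTILINE keeps '^' and '$' anchored at
// line boundaries, so a pattern such as "a.b" matches the row "a\nb" only when DOTALL is
// passed. The header paths and enumerator spellings below are assumptions about the public
// strings API and are not part of this diff.
#include <cudf/column/column.hpp>
#include <cudf/strings/contains.hpp>
#include <cudf/strings/regex/flags.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

std::unique_ptr<cudf::column> dotall_contains_sketch(cudf::strings_column_view const& input)
{
  // returns a BOOL8 column with true for rows where the pattern matches
  return cudf::strings::contains_re(
    input, "a.b", cudf::strings::regex_flags::DOTALL, rmm::mr::get_current_device_resource());
}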
*/ - static reprog create_from(const char32_t* pattern); + static reprog create_from(const char32_t* pattern, regex_flags const flags); int32_t add_inst(int32_t type); int32_t add_inst(reinst inst); @@ -113,7 +116,7 @@ class reprog { void optimize1(); void optimize2(); - void print(); // for debugging + void print(regex_flags const flags); private: std::vector _insts; diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index 564f742b2cd..27556d90b1b 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -17,6 +17,7 @@ #include +#include #include #include @@ -95,17 +96,32 @@ class reprog_device { * regex. * * @param pattern The regex pattern to compile. - * @param cp_flags The code-point lookup table for character types. + * @param codepoint_flags The code point lookup table for character types. * @param strings_count Number of strings that will be evaluated. - * @param stream CUDA stream for asynchronous memory allocations. To ensure correct - * synchronization on destruction, the same stream should be used for all operations with the - * created objects. + * @param stream CUDA stream used for device memory operations and kernel launches. * @return The program device object. */ static std::unique_ptr> create( std::string const& pattern, - const uint8_t* cp_flags, - int32_t strings_count, + uint8_t const* codepoint_flags, + size_type strings_count, + rmm::cuda_stream_view stream); + + /** + * @brief Create the device program instance from a regex pattern. + * + * @param pattern The regex pattern to compile. + * @param re_flags Regex flags for interpreting special characters in the pattern. + * @param codepoint_flags The code point lookup table for character types. + * @param strings_count Number of strings that will be evaluated. + * @param stream CUDA stream used for device memory operations and kernel launches + * @return The program device object. + */ + static std::unique_ptr> create( + std::string const& pattern, + regex_flags const re_flags, + uint8_t const* codepoint_flags, + size_type strings_count, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl index 854fce15fd4..66e99756615 100644 --- a/cpp/src/strings/regex/regex.inl +++ b/cpp/src/strings/regex/regex.inl @@ -198,10 +198,10 @@ __device__ inline int32_t reprog_device::regexec( { int32_t match = 0; auto checkstart = jnk.starttype; - auto txtlen = dstr.length(); auto pos = begin; auto eos = end; char32_t c = 0; + auto last_character = false; string_view::const_iterator itr = string_view::const_iterator(dstr, pos); jnk.list1->reset(); @@ -235,7 +235,9 @@ __device__ inline int32_t reprog_device::regexec( jnk.list1->activate(ids[i++], (group_id == 0 ? pos : -1), -1); } - c = static_cast(pos >= txtlen ? 0 : *itr); + last_character = (pos >= dstr.length()); + + c = static_cast(last_character ? 0 : *itr); // expand LBRA, RBRA, BOL, EOL, BOW, NBOW, and OR bool expanded = false; @@ -274,7 +276,7 @@ __device__ inline int32_t reprog_device::regexec( } break; case EOL: - if ((c == 0) || (inst->u1.c == '$' && c == '\n')) { + if (last_character || (inst->u1.c == '$' && c == '\n')) { id_activate = inst->u2.next_id; expanded = true; } @@ -360,7 +362,7 @@ __device__ inline int32_t reprog_device::regexec( ++itr; swaplist(jnk.list1, jnk.list2); checkstart = jnk.list1->size > 0 ? 
0 : 1; - } while (c && (jnk.list1->size > 0 || match == 0)); + } while (!last_character && (jnk.list1->size > 0 || match == 0)); return match; } diff --git a/cpp/src/strings/regex/regexec.cu b/cpp/src/strings/regex/regexec.cu index bd040eecaa6..4f93bbd6e7b 100644 --- a/cpp/src/strings/regex/regexec.cu +++ b/cpp/src/strings/regex/regexec.cu @@ -72,16 +72,27 @@ reprog_device::reprog_device(reprog& prog) { } +std::unique_ptr> reprog_device::create( + std::string const& pattern, + uint8_t const* codepoint_flags, + size_type strings_count, + rmm::cuda_stream_view stream) +{ + return reprog_device::create( + pattern, regex_flags::MULTILINE, codepoint_flags, strings_count, stream); +} + // Create instance of the reprog that can be passed into a device kernel std::unique_ptr> reprog_device::create( std::string const& pattern, + regex_flags const flags, uint8_t const* codepoint_flags, - int32_t strings_count, + size_type strings_count, rmm::cuda_stream_view stream) { std::vector pattern32 = string_to_char32_vector(pattern); // compile pattern into host object - reprog h_prog = reprog::create_from(pattern32.data()); + reprog h_prog = reprog::create_from(pattern32.data(), flags); // compute size to hold all the member data auto insts_count = h_prog.insts_count(); auto classes_count = h_prog.classes_count(); diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 2e919c5000c..2e5be9e55f6 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -141,7 +141,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, rmm::mr::device_memory_resource* mr) { auto const strings_count = input.size(); - if (strings_count == 0) { return make_empty_column(data_type{type_id::STRING}); } + if (strings_count == 0) { return make_empty_column(type_id::STRING); } if (repeat_times <= 0) { // If the number of repetitions is not positive, each row of the output strings column will be @@ -302,7 +302,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, } auto const strings_count = input.size(); - if (strings_count == 0) { return make_empty_column(data_type{type_id::STRING}); } + if (strings_count == 0) { return make_empty_column(type_id::STRING); } auto const strings_dv_ptr = column_device_view::create(input.parent(), stream); auto const repeat_times_dv_ptr = column_device_view::create(repeat_times, stream); @@ -342,7 +342,7 @@ std::pair, int64_t> repeat_strings_output_sizes( auto const strings_count = input.size(); if (strings_count == 0) { - return std::make_pair(make_empty_column(data_type{type_to_id()}), int64_t{0}); + return std::make_pair(make_empty_column(type_to_id()), int64_t{0}); } auto output_sizes = make_numeric_column( diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index d8c0e61aafe..87603e4c35b 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -104,7 +104,7 @@ std::unique_ptr replace_with_backrefs( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (strings.is_empty()) return make_empty_column(type_id::STRING); CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); CUDF_EXPECTS(!repl.empty(), "Parameter repl must not be empty"); diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index e6f9cbc65c4..25417909c89 100644 --- 
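The regcomp/regexec hunks above thread a regex_flags value through compilation: is_dotall(flags) selects ANYNL for the dot instruction, is_multiline(flags) decides whether the BOL/EOL instructions also anchor at '\n', and the retained reprog_device::create overload defaults to regex_flags::MULTILINE so existing callers keep the previous behaviour. As a standalone C++17 illustration of the semantic being toggled (using std::regex rather than the cuDF regex engine, so nothing below is cuDF API):

// Standalone C++17 sketch (not cuDF code) of the behaviour the MULTILINE flag
// controls: with multiline semantics, the '^'/'$' anchors also match at '\n'
// boundaries instead of only at the start and end of the whole string.
#include <cassert>
#include <regex>
#include <string>

int main()
{
  std::string const text{"abc\ndef"};

  std::regex const whole_string{"^def$"};
  std::regex const per_line{"^def$",
                            std::regex_constants::ECMAScript | std::regex_constants::multiline};

  assert(!std::regex_search(text, whole_string));  // '^' only anchors at position 0
  assert(std::regex_search(text, per_line));       // '^' also anchors just after '\n'
  return 0;
}
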
a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -134,7 +134,7 @@ std::unique_ptr replace_re( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); if (patterns.empty()) // no patterns; just return a copy return std::make_unique(strings.parent(), stream, mr); diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 1d5df3c0a29..4d32d91c1d4 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -405,8 +405,8 @@ std::unique_ptr replace_char_parallel(strings_column_view const& strings { auto const strings_count = strings.size(); auto const offset_count = strings_count + 1; - auto const d_offsets = strings.offsets().data() + strings.offset(); - auto const d_in_chars = strings.chars().data(); + auto const d_offsets = strings.offsets_begin(); + auto const d_in_chars = strings.chars_begin(); auto const chars_bytes = chars_end - chars_start; auto const target_size = d_target.size_bytes(); @@ -547,10 +547,10 @@ std::unique_ptr replace(strings_column_view con rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (strings.is_empty()) return make_empty_column(type_id::STRING); if (maxrepl == 0) return std::make_unique(strings.parent(), stream, mr); - CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); - CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid."); + CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); + CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid."); CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string."); string_view d_target(target.data(), target.size()); @@ -586,10 +586,10 @@ std::unique_ptr replace( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (strings.is_empty()) return make_empty_column(type_id::STRING); if (maxrepl == 0) return std::make_unique(strings.parent(), stream, mr); - CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); - CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid."); + CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); + CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid."); CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string."); string_view d_target(target.data(), target.size()); @@ -598,7 +598,7 @@ std::unique_ptr replace( // determine range of characters in the base column auto const strings_count = strings.size(); auto const offset_count = strings_count + 1; - auto const d_offsets = strings.offsets().data() + strings.offset(); + auto const d_offsets = strings.offsets_begin(); size_type chars_start = (strings.offset() == 0) ? 
0 : cudf::detail::get_value( strings.offsets(), strings.offset(), stream); @@ -619,10 +619,10 @@ std::unique_ptr replace( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (strings.is_empty()) return make_empty_column(type_id::STRING); if (maxrepl == 0) return std::make_unique(strings.parent(), stream, mr); - CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); - CUDF_EXPECTS(target.is_valid(), "Parameter target must be valid."); + CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); + CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid."); CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string."); string_view d_target(target.data(), target.size()); @@ -679,8 +679,8 @@ std::unique_ptr replace_slice(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING}); - CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); + if (strings.is_empty()) return make_empty_column(type_id::STRING); + CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop."); string_view d_repl(repl.data(), repl.size()); @@ -761,7 +761,7 @@ std::unique_ptr replace(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (strings.is_empty()) return make_empty_column(type_id::STRING); CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)), "Parameters targets must not be empty and must not have nulls"); CUDF_EXPECTS(((repls.size() > 0) && (repls.null_count() == 0)), @@ -790,8 +790,8 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); - CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid."); + if (strings_count == 0) return make_empty_column(type_id::STRING); + CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid."); string_view d_repl(repl.data(), repl.size()); diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index b2f1ef37a74..b940944c186 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -108,9 +108,9 @@ std::unique_ptr replace_re( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); - CUDF_EXPECTS(repl.is_valid(), "Parameter repl must be valid"); + CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid"); string_view d_repl(repl.data(), repl.size()); auto strings_column = column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu index aa096f60333..eef26691319 100644 --- a/cpp/src/strings/split/partition.cu +++ b/cpp/src/strings/split/partition.cu @@ -182,7 +182,7 @@ std::unique_ptr
partition( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); auto strings_count = strings.size(); if (strings_count == 0) return std::make_unique
(std::vector>()); auto strings_column = column_device_view::create(strings.parent(), stream); @@ -210,7 +210,7 @@ std::unique_ptr
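The recurring is_valid() -> is_valid(stream) edits in these string APIs pass the caller's stream to the scalar validity check, so the device-to-host read of the validity flag is ordered on that stream rather than on the default stream. A minimal sketch of the call shape (the scalar and the message below are illustrative placeholders, not taken from this change):

// Minimal sketch of checking a scalar's validity on a caller-provided stream;
// the scalar name and error message are placeholders.
#include <cudf/scalar/scalar.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>

void validate_on_stream(rmm::cuda_stream_view stream)
{
  cudf::string_scalar delimiter{" "};  // device-backed scalar, valid by construction

  // Mirrors the pattern in the hunks above: the validity read synchronizes on `stream`.
  CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
}
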
rpartition( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); auto strings_count = strings.size(); if (strings_count == 0) return std::make_unique
(std::vector>()); auto strings_column = column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 9c5be1c9ca3..5113b418501 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -426,12 +426,11 @@ std::unique_ptr
split_fn(strings_column_view const& strings_column, std::vector> results; auto const strings_count = strings_column.size(); if (strings_count == 0) { - results.push_back(make_empty_column(data_type{type_id::STRING})); + results.push_back(make_empty_column(type_id::STRING)); return std::make_unique
(std::move(results)); } - auto d_offsets = strings_column.offsets().data(); - d_offsets += strings_column.offset(); // nvbug-2808421 : do not combine with the previous line + auto d_offsets = strings_column.offsets_begin(); auto const chars_bytes = cudf::detail::get_value( strings_column.offsets(), strings_column.offset() + strings_count, stream) - @@ -800,7 +799,7 @@ std::unique_ptr
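The split.cu hunk above (like the earlier replace.cu ones) swaps hand-written pointer arithmetic over the offsets child, including the nvbug-2808421 workaround comment, for the new strings_column_view accessors whose definitions appear further down in strings_column_view.cpp. A minimal sketch of what those accessors provide, for an arbitrary strings_column_view scv:

// Minimal sketch (not from the change set) of the new accessors. Per the
// definitions added in strings_column_view.cpp below:
//   offsets_begin() == offsets().begin<...>() + offset()   (slice-aware)
//   offsets_end()   == offsets_begin() + size() + 1
//   chars_begin()   == chars().begin<...>()
//   chars_end()     == chars_begin() + chars_size()
#include <cudf/strings/strings_column_view.hpp>

void accessor_sketch(cudf::strings_column_view const& scv)
{
  auto const d_offsets     = scv.offsets_begin();  // device pointer to this view's first offset
  auto const d_chars_first = scv.chars_begin();    // device pointer to the raw character bytes
  auto const d_chars_last  = scv.chars_end();

  (void)d_offsets;
  (void)d_chars_first;
  (void)d_chars_last;
}
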
split( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); size_type max_tokens = 0; if (maxsplit > 0) max_tokens = maxsplit + 1; // makes consistent with Pandas @@ -825,7 +824,7 @@ std::unique_ptr
rsplit( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); size_type max_tokens = 0; if (maxsplit > 0) max_tokens = maxsplit + 1; // makes consistent with Pandas diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index b780791c7a5..a31716ad2a2 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -266,7 +266,7 @@ std::unique_ptr split_record( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); // makes consistent with Pandas size_type max_tokens = maxsplit > 0 ? maxsplit + 1 : std::numeric_limits::max(); diff --git a/cpp/src/strings/strings_column_view.cu b/cpp/src/strings/strings_column_view.cpp similarity index 75% rename from cpp/src/strings/strings_column_view.cu rename to cpp/src/strings/strings_column_view.cpp index f6e64ded09b..6de478d3e1e 100644 --- a/cpp/src/strings/strings_column_view.cu +++ b/cpp/src/strings/strings_column_view.cpp @@ -14,19 +14,9 @@ * limitations under the License. */ -#include -#include -#include -#include #include #include -#include -#include -#include - -#include - namespace cudf { // strings_column_view::strings_column_view(column_view strings_column) : column_view(strings_column) @@ -42,6 +32,16 @@ column_view strings_column_view::offsets() const return child(offsets_column_index); } +strings_column_view::offset_iterator strings_column_view::offsets_begin() const +{ + return offsets().begin() + offset(); +} + +strings_column_view::offset_iterator strings_column_view::offsets_end() const +{ + return offsets_begin() + size() + 1; +} + column_view strings_column_view::chars() const { CUDF_EXPECTS(num_children() > 0, "strings column has no children"); @@ -54,4 +54,14 @@ size_type strings_column_view::chars_size() const noexcept return chars().size(); } +strings_column_view::chars_iterator strings_column_view::chars_begin() const +{ + return chars().begin(); +} + +strings_column_view::chars_iterator strings_column_view::chars_end() const +{ + return chars_begin() + chars_size(); +} + } // namespace cudf diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index 0c4fe841401..2b1e6969956 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -100,9 +100,9 @@ std::unique_ptr strip( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (strings.is_empty()) return make_empty_column(type_id::STRING); - CUDF_EXPECTS(to_strip.is_valid(), "Parameter to_strip must be valid"); + CUDF_EXPECTS(to_strip.is_valid(stream), "Parameter to_strip must be valid"); string_view const d_to_strip(to_strip.data(), to_strip.size()); auto const d_column = column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index 82759a6c73f..7a193a16434 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -109,9 +109,9 @@ std::unique_ptr slice_strings( 
rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (strings.is_empty()) return make_empty_column(type_id::STRING); - if (step.is_valid()) CUDF_EXPECTS(step.value(stream) != 0, "Step parameter must not be 0"); + if (step.is_valid(stream)) CUDF_EXPECTS(step.value(stream) != 0, "Step parameter must not be 0"); auto const d_column = column_device_view::create(strings.parent(), stream); auto const d_start = get_scalar_device_view(const_cast&>(start)); @@ -295,7 +295,7 @@ std::unique_ptr slice_strings( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { size_type strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); CUDF_EXPECTS(starts_column.size() == strings_count, "Parameter starts must have the same number of rows as strings."); CUDF_EXPECTS(stops_column.size() == strings_count, diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 6425085b5ad..ad3515e8058 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -87,7 +87,7 @@ std::unique_ptr translate( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - if (strings.is_empty()) return make_empty_column(data_type{type_id::STRING}); + if (strings.is_empty()) return make_empty_column(type_id::STRING); size_type table_size = static_cast(chars_table.size()); // convert input table diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index ca96e4d5f53..ce3c383352d 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -98,7 +98,7 @@ std::unique_ptr wrap( CUDF_EXPECTS(width > 0, "Positive wrap width required"); auto strings_count = strings.size(); - if (strings_count == 0) return make_empty_column(data_type{type_id::STRING}); + if (strings_count == 0) return make_empty_column(type_id::STRING); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; diff --git a/cpp/src/structs/copying/concatenate.cu b/cpp/src/structs/copying/concatenate.cu index fe5483b119d..7656470b791 100644 --- a/cpp/src/structs/copying/concatenate.cu +++ b/cpp/src/structs/copying/concatenate.cu @@ -21,8 +21,8 @@ #include #include #include +#include #include -#include #include diff --git a/cpp/src/structs/structs_column_factories.cu b/cpp/src/structs/structs_column_factories.cu index 833ceab7518..ee401c82bcb 100644 --- a/cpp/src/structs/structs_column_factories.cu +++ b/cpp/src/structs/structs_column_factories.cu @@ -14,9 +14,8 @@ * limitations under the License. */ -#include - #include +#include #include #include diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index b84af73b681..47f8f29385c 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -14,10 +14,9 @@ * limitations under the License. */ -#include - #include #include +#include #include #include #include @@ -26,9 +25,13 @@ #include #include #include -#include + +#include + +#include #include +#include namespace cudf { namespace structs { @@ -85,30 +88,40 @@ bool is_or_has_nested_lists(cudf::column_view const& col) * @brief Flattens struct columns to constituent non-struct columns in the input table. 
* */ -struct flattened_table { +struct table_flattener { + table_view input; // reference variables - table_view const& input; std::vector const& column_order; std::vector const& null_precedence; // output std::vector> validity_as_column; + std::vector superimposed_nullmasks; std::vector flat_columns; std::vector flat_column_order; std::vector flat_null_precedence; column_nullability nullability; - flattened_table(table_view const& input, + table_flattener(table_view const& input, std::vector const& column_order, std::vector const& null_precedence, column_nullability nullability) - : input(input), - column_order(column_order), - null_precedence(null_precedence), - nullability(nullability) + : column_order(column_order), null_precedence(null_precedence), nullability(nullability) { + superimpose_nulls(input); fail_if_unsupported_types(input); } + /** + * @brief Pushes down nulls from struct columns to children, saves the resulting + * column to `input`, and generated null masks to `superimposed_nullmasks`. + */ + void superimpose_nulls(table_view const& input_table) + { + auto [table, null_masks] = superimpose_parent_nulls(input_table); + this->input = table; + this->superimposed_nullmasks = std::move(null_masks); + } + void fail_if_unsupported_types(table_view const& input) const { auto const has_lists = std::any_of(input.begin(), input.end(), is_or_has_nested_lists); @@ -176,29 +189,23 @@ struct flattened_table { } } - return std::make_tuple(table_view{flat_columns}, + return flattened_table{table_view{flat_columns}, std::move(flat_column_order), std::move(flat_null_precedence), - std::move(validity_as_column)); + std::move(validity_as_column), + std::move(superimposed_nullmasks)}; } }; -std::tuple, - std::vector, - std::vector>> -flatten_nested_columns(table_view const& input, - std::vector const& column_order, - std::vector const& null_precedence, - column_nullability nullability) +flattened_table flatten_nested_columns(table_view const& input, + std::vector const& column_order, + std::vector const& null_precedence, + column_nullability nullability) { auto const has_struct = std::any_of(input.begin(), input.end(), is_struct); - if (not has_struct) { - return std::make_tuple( - input, column_order, null_precedence, std::vector>{}); - } + if (not has_struct) { return flattened_table{input, column_order, null_precedence, {}, {}}; } - return flattened_table{input, column_order, null_precedence, nullability}(); + return table_flattener{input, column_order, null_precedence, nullability}(); } namespace { @@ -415,6 +422,21 @@ std::tuple> superimpose_paren std::move(ret_validity_buffers)); } +std::tuple> superimpose_parent_nulls( + table_view const& table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) +{ + auto superimposed_columns = std::vector{}; + auto superimposed_nullmasks = std::vector{}; + for (auto col : table) { + auto [superimposed_col, null_masks] = superimpose_parent_nulls(col); + superimposed_columns.push_back(superimposed_col); + superimposed_nullmasks.insert(superimposed_nullmasks.begin(), + std::make_move_iterator(null_masks.begin()), + std::make_move_iterator(null_masks.end())); + } + return {table_view{superimposed_columns}, std::move(superimposed_nullmasks)}; +} + } // namespace detail } // namespace structs } // namespace cudf diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index fda2a2ca786..853b4820a5c 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -141,7 +141,7 @@ std::unique_ptr 
detokenize(cudf::strings_column_view const& string rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be valid"); + CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be valid"); CUDF_EXPECTS(row_indices.size() == strings.size(), "Parameter row_indices must be the same size as the input column"); CUDF_EXPECTS(row_indices.has_nulls() == false, "Parameter row_indices must not have nulls"); diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index b41fe6150e7..87c288691dd 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -86,7 +86,7 @@ std::unique_ptr generate_ngrams( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be valid"); + CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be valid"); cudf::string_view const d_separator(separator.data(), separator.size()); CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater"); @@ -100,7 +100,7 @@ std::unique_ptr generate_ngrams( // first create a new offsets vector removing nulls and empty strings from the input column std::unique_ptr non_empty_offsets_column = [&] { cudf::column_view offsets_view( - cudf::data_type{cudf::type_id::INT32}, strings_count + 1, strings.offsets().data()); + cudf::data_type{cudf::type_id::INT32}, strings_count + 1, strings.offsets_begin()); auto table_offsets = cudf::detail::copy_if( cudf::table_view({offsets_view}), [d_strings, strings_count] __device__(cudf::size_type idx) { diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index 5361dae7b5f..03f66609e18 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -138,9 +138,9 @@ std::unique_ptr ngrams_tokenize( rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { - CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); - CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be valid"); + CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be valid"); cudf::string_view d_separator(separator.data(), separator.size()); CUDF_EXPECTS(ngrams >= 1, "Parameter ngrams should be an integer value of 1 or greater"); diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index d7128803ca0..9ca39bca995 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -203,7 +203,7 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st if (replacements.size() != 1) CUDF_EXPECTS(replacements.size() == targets.size(), "Parameter targets and replacements must be the same size"); - CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); cudf::size_type const strings_count = strings.size(); if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); @@ -239,8 +239,8 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(replacement.is_valid(), "Parameter 
replacement must be valid"); - CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); + CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid"); + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); cudf::size_type const strings_count = strings.size(); if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index fc38a13334a..961797e188f 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -102,7 +102,7 @@ std::unique_ptr tokenize(cudf::strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); auto strings_column = cudf::column_device_view::create(strings.parent(), stream); return tokenize_fn(strings.size(), strings_tokenizer{*strings_column, d_delimiter}, stream, mr); @@ -114,7 +114,7 @@ std::unique_ptr count_tokens(cudf::strings_column_view const& stri rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(delimiter.is_valid(), "Parameter delimiter must be valid"); + CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); cudf::string_view d_delimiter(delimiter.data(), delimiter.size()); auto strings_column = cudf::column_device_view::create(strings.parent(), stream); return token_count_fn( diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu new file mode 100644 index 00000000000..5b5c35df551 --- /dev/null +++ b/cpp/src/transform/one_hot_encode.cu @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace detail { + +namespace { + +template +struct one_hot_encode_functor { + one_hot_encode_functor(column_device_view input, column_device_view category) + : _equality_comparator{input, category}, _input_size{input.size()} + { + } + + bool __device__ operator()(size_type i) + { + size_type const element_index = i % _input_size; + size_type const category_index = i / _input_size; + return _equality_comparator.template operator()(element_index, category_index); + } + + private: + element_equality_comparator const _equality_comparator; + size_type const _input_size; +}; + +} // anonymous namespace + +template +struct one_hot_encode_launcher { + template ())> + std::pair, table_view> operator()(column_view const& input_column, + column_view const& categories, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + auto const total_size = input_column.size() * categories.size(); + auto all_encodings = make_numeric_column( + data_type{type_id::BOOL8}, total_size, mask_state::UNALLOCATED, stream, mr); + + auto d_input_column = column_device_view::create(input_column, stream); + auto d_category_column = column_device_view::create(categories, stream); + one_hot_encode_functor one_hot_encoding_compute_f(*d_input_column, + *d_category_column); + + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(total_size), + all_encodings->mutable_view().begin(), + one_hot_encoding_compute_f); + + auto split_iter = make_counting_transform_iterator( + 1, [width = input_column.size()](auto i) { return i * width; }); + std::vector split_indices(split_iter, split_iter + categories.size() - 1); + + // TODO: use detail interface, gh9226 + auto views = cudf::split(all_encodings->view(), split_indices); + table_view encodings_view{views}; + + return std::make_pair(std::move(all_encodings), encodings_view); + } + + template ())> + std::pair, table_view> operator()(Args&&...) + { + CUDF_FAIL("Cannot encode column type without well-defined equality operator."); + } +}; + +std::pair, table_view> one_hot_encode(column_view const& input, + column_view const& categories, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(input.type() == categories.type(), "Mismatch type between input and categories."); + + if (categories.is_empty()) { + return std::make_pair(make_empty_column(type_id::BOOL8), table_view{}); + } + + if (input.is_empty()) { + auto empty_data = make_empty_column(type_id::BOOL8); + std::vector views(categories.size(), empty_data->view()); + return std::make_pair(std::move(empty_data), table_view{views}); + } + + return (!input.nullable() && !categories.nullable()) + ? 
type_dispatcher( + input.type(), one_hot_encode_launcher{}, input, categories, stream, mr) + : type_dispatcher( + input.type(), one_hot_encode_launcher{}, input, categories, stream, mr); +} + +} // namespace detail + +std::pair, table_view> one_hot_encode(column_view const& input, + column_view const& categories, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::one_hot_encode(input, categories, rmm::cuda_stream_default, mr); +} +} // namespace cudf diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index 27936ce04b3..efa011ea4a6 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -468,7 +468,7 @@ std::unique_ptr row_bit_count(table_view const& t, rmm::mr::device_memory_resource* mr) { // no rows - if (t.num_rows() <= 0) { return cudf::make_empty_column(data_type{type_id::INT32}); } + if (t.num_rows() <= 0) { return cudf::make_empty_column(type_id::INT32); } // flatten the hierarchy and determine some information about it. std::vector cols; diff --git a/cpp/src/transpose/transpose.cu b/cpp/src/transpose/transpose.cu index 5bc2cb21ac7..d119bc36c73 100644 --- a/cpp/src/transpose/transpose.cu +++ b/cpp/src/transpose/transpose.cu @@ -49,7 +49,7 @@ std::pair, table_view> transpose(table_view const& input auto splits_iter = thrust::make_transform_iterator( one_iter, [width = input.num_columns()](size_type idx) { return idx * width; }); auto splits = std::vector(splits_iter, splits_iter + input.num_rows() - 1); - auto output_column_views = cudf::split(output_column->view(), splits); + auto output_column_views = split(output_column->view(), splits, stream); return std::make_pair(std::move(output_column), table_view(output_column_views)); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d3475cbbed2..98bade7e15f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,142 +1,141 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2018-2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
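The new one_hot_encode.cu above adds a public cudf::one_hot_encode(input, categories, mr) that returns the owning BOOL8 column together with a table_view holding one encoded column per category. A minimal usage sketch, assuming the declaration lives in cudf/transform.hpp and using the test column wrappers purely for brevity (neither assumption comes from this diff):

// Minimal usage sketch. The <cudf/transform.hpp> header path is an assumption;
// only the signature defined in one_hot_encode.cu above is taken as given.
#include <cudf/transform.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <rmm/mr/device/per_device_resource.hpp>

void one_hot_encode_sketch()
{
  // input:      [1, 2, 1, 3]
  // categories: [1, 2, 3]
  cudf::test::fixed_width_column_wrapper<int32_t> input{1, 2, 1, 3};
  cudf::test::fixed_width_column_wrapper<int32_t> categories{1, 2, 3};

  // `owner` holds all of the BOOL8 data; `encodings` is a table_view with one
  // column per category (each of input.size() rows) sliced out of that buffer,
  // so the views stay valid only while `owner` is alive.
  auto [owner, encodings] =
    cudf::one_hot_encode(input, categories, rmm::mr::get_current_device_resource());

  (void)encodings;
}
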
-#============================================================================= - -################################################################################################### -# - compiler function ----------------------------------------------------------------------------- - -function(ConfigureTest CMAKE_TEST_NAME ) - add_executable(${CMAKE_TEST_NAME} ${ARGN}) - set_target_properties(${CMAKE_TEST_NAME} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$") - target_link_libraries(${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main) - add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# ################################################################################################## +# * compiler function ----------------------------------------------------------------------------- + +# This function takes in a test name and test source and handles setting all of the associated +# properties and linking to build the test +function(ConfigureTest CMAKE_TEST_NAME) + add_executable(${CMAKE_TEST_NAME} ${ARGN}) + set_target_properties( + ${CMAKE_TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY + "$" + ) + target_link_libraries(${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main) + add_test(NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME}) endfunction() -################################################################################################### -### test sources ################################################################################## -################################################################################################### - -################################################################################################### -# - column tests ---------------------------------------------------------------------------------- -ConfigureTest(COLUMN_TEST - column/bit_cast_test.cpp - column/column_view_shallow_test.cpp - column/column_test.cu - column/column_device_view_test.cu - column/compound_test.cu) - -################################################################################################### -# - scalar tests ---------------------------------------------------------------------------------- -ConfigureTest(SCALAR_TEST - scalar/scalar_test.cpp - scalar/scalar_device_view_test.cu) - -################################################################################################### -# - timestamps tests ------------------------------------------------------------------------------ +# ################################################################################################## +# test sources ################################################################################## +# ################################################################################################## + +# ################################################################################################## +# * column tests ---------------------------------------------------------------------------------- +ConfigureTest( + COLUMN_TEST column/bit_cast_test.cpp column/column_view_shallow_test.cpp column/column_test.cu + 
column/column_device_view_test.cu column/compound_test.cu +) + +# ################################################################################################## +# * scalar tests ---------------------------------------------------------------------------------- +ConfigureTest(SCALAR_TEST scalar/scalar_test.cpp scalar/scalar_device_view_test.cu) + +# ################################################################################################## +# * timestamps tests ------------------------------------------------------------------------------ ConfigureTest(TIMESTAMPS_TEST wrappers/timestamps_test.cu) -################################################################################################### -# - cudf tests ------------------------------------------------------------------------------------ +# ################################################################################################## +# * cudf tests ------------------------------------------------------------------------------------ ConfigureTest(ERROR_TEST error/error_handling_test.cu) -################################################################################################### -# - groupby tests --------------------------------------------------------------------------------- -ConfigureTest(GROUPBY_TEST - groupby/argmin_tests.cpp - groupby/argmax_tests.cpp - groupby/collect_list_tests.cpp - groupby/collect_set_tests.cpp - groupby/count_scan_tests.cpp - groupby/count_tests.cpp - groupby/groups_tests.cpp - groupby/keys_tests.cpp - groupby/lists_tests.cpp - groupby/m2_tests.cpp - groupby/min_tests.cpp - groupby/max_scan_tests.cpp - groupby/max_tests.cpp - groupby/mean_tests.cpp - groupby/median_tests.cpp - groupby/merge_m2_tests.cpp - groupby/merge_lists_tests.cpp - groupby/merge_sets_tests.cpp - groupby/min_scan_tests.cpp - groupby/nth_element_tests.cpp - groupby/nunique_tests.cpp - groupby/product_tests.cpp - groupby/quantile_tests.cpp - groupby/rank_scan_tests.cpp - groupby/replace_nulls_tests.cpp - groupby/shift_tests.cpp - groupby/std_tests.cpp - groupby/structs_tests.cpp - groupby/sum_of_squares_tests.cpp - groupby/sum_scan_tests.cpp - groupby/sum_tests.cpp - groupby/tdigest_tests.cu - groupby/var_tests.cpp) - -################################################################################################### -# - join tests ------------------------------------------------------------------------------------ -ConfigureTest(JOIN_TEST - join/join_tests.cpp - join/conditional_join_tests.cu - join/cross_join_tests.cpp - join/semi_anti_join_tests.cpp) - -################################################################################################### -# - is_sorted tests ------------------------------------------------------------------------------- +# ################################################################################################## +# * groupby tests --------------------------------------------------------------------------------- +ConfigureTest( + GROUPBY_TEST + groupby/argmin_tests.cpp + groupby/argmax_tests.cpp + groupby/collect_list_tests.cpp + groupby/collect_set_tests.cpp + groupby/correlation_tests.cpp + groupby/count_scan_tests.cpp + groupby/count_tests.cpp + groupby/covariance_tests.cpp + groupby/groups_tests.cpp + groupby/keys_tests.cpp + groupby/lists_tests.cpp + groupby/m2_tests.cpp + groupby/min_tests.cpp + groupby/max_scan_tests.cpp + groupby/max_tests.cpp + groupby/mean_tests.cpp + groupby/median_tests.cpp + groupby/merge_m2_tests.cpp + groupby/merge_lists_tests.cpp + 
groupby/merge_sets_tests.cpp + groupby/min_scan_tests.cpp + groupby/nth_element_tests.cpp + groupby/nunique_tests.cpp + groupby/product_tests.cpp + groupby/quantile_tests.cpp + groupby/rank_scan_tests.cpp + groupby/replace_nulls_tests.cpp + groupby/shift_tests.cpp + groupby/std_tests.cpp + groupby/structs_tests.cpp + groupby/sum_of_squares_tests.cpp + groupby/sum_scan_tests.cpp + groupby/sum_tests.cpp + groupby/tdigest_tests.cu + groupby/var_tests.cpp +) + +# ################################################################################################## +# * join tests ------------------------------------------------------------------------------------ +ConfigureTest( + JOIN_TEST join/join_tests.cpp join/conditional_join_tests.cu join/cross_join_tests.cpp + join/semi_anti_join_tests.cpp +) + +# ################################################################################################## +# * is_sorted tests ------------------------------------------------------------------------------- ConfigureTest(IS_SORTED_TEST sort/is_sorted_tests.cpp) -################################################################################################### -# - datetime tests -------------------------------------------------------------------------------- +# ################################################################################################## +# * datetime tests -------------------------------------------------------------------------------- ConfigureTest(DATETIME_OPS_TEST datetime/datetime_ops_test.cpp) -################################################################################################### -# - hashing tests --------------------------------------------------------------------------------- +# ################################################################################################## +# * hashing tests --------------------------------------------------------------------------------- ConfigureTest(HASHING_TEST hashing/hash_test.cpp) -################################################################################################### -# - partitioning tests ---------------------------------------------------------------------------- -ConfigureTest(PARTITIONING_TEST - partitioning/hash_partition_test.cpp - partitioning/round_robin_test.cpp - partitioning/partition_test.cpp) - -################################################################################################### -# - hash_map tests -------------------------------------------------------------------------------- -ConfigureTest(HASH_MAP_TEST - hash_map/map_test.cu - hash_map/multimap_test.cu) - -################################################################################################### -# - quantiles tests ------------------------------------------------------------------------------- -ConfigureTest(QUANTILES_TEST - quantiles/percentile_approx_test.cu - quantiles/quantile_test.cpp - quantiles/quantiles_test.cpp) - -################################################################################################### -# - reduction tests ------------------------------------------------------------------------------- -ConfigureTest(REDUCTION_TEST - reductions/reduction_tests.cpp - reductions/scan_tests.cpp) - -################################################################################################### -# - replace tests --------------------------------------------------------------------------------- +# ################################################################################################## 
+# * partitioning tests ---------------------------------------------------------------------------- +ConfigureTest( + PARTITIONING_TEST partitioning/hash_partition_test.cpp partitioning/round_robin_test.cpp + partitioning/partition_test.cpp +) + +# ################################################################################################## +# * hash_map tests -------------------------------------------------------------------------------- +ConfigureTest(HASH_MAP_TEST hash_map/map_test.cu hash_map/multimap_test.cu) + +# ################################################################################################## +# * quantiles tests ------------------------------------------------------------------------------- +ConfigureTest( + QUANTILES_TEST quantiles/percentile_approx_test.cu quantiles/quantile_test.cpp + quantiles/quantiles_test.cpp +) + +# ################################################################################################## +# * reduction tests ------------------------------------------------------------------------------- +ConfigureTest( + REDUCTION_TEST reductions/rank_tests.cpp reductions/reduction_tests.cpp reductions/scan_tests.cpp +) + +# ################################################################################################## +# * replace tests --------------------------------------------------------------------------------- ConfigureTest(REPLACE_TEST replace/replace_tests.cpp) ConfigureTest(REPLACE_NULLS_TEST replace/replace_nulls_tests.cpp) @@ -147,52 +146,50 @@ ConfigureTest(NORMALIZE_REPLACE_TEST replace/normalize_replace_tests.cpp) ConfigureTest(CLAMP_TEST replace/clamp_test.cpp) -################################################################################################### -# - fixed_point tests ----------------------------------------------------------------------------- -ConfigureTest(FIXED_POINT_TEST - fixed_point/fixed_point_tests.cpp - fixed_point/fixed_point_tests.cu) - -################################################################################################### -# - unary tests ----------------------------------------------------------------------------------- -ConfigureTest(UNARY_TEST - unary/math_ops_test.cpp - unary/unary_ops_test.cpp - unary/cast_tests.cpp) - -################################################################################################### -# - round tests ----------------------------------------------------------------------------------- +# ################################################################################################## +# * fixed_point tests ----------------------------------------------------------------------------- +ConfigureTest(FIXED_POINT_TEST fixed_point/fixed_point_tests.cpp fixed_point/fixed_point_tests.cu) + +# ################################################################################################## +# * unary tests ----------------------------------------------------------------------------------- +ConfigureTest(UNARY_TEST unary/math_ops_test.cpp unary/unary_ops_test.cpp unary/cast_tests.cpp) + +# ################################################################################################## +# * round tests ----------------------------------------------------------------------------------- ConfigureTest(ROUND_TEST round/round_tests.cpp) -################################################################################################### -# - binary tests ---------------------------------------------------------------------------------- 
-ConfigureTest(BINARY_TEST - binaryop/binop-verify-input-test.cpp - binaryop/binop-null-test.cpp - binaryop/binop-integration-test.cpp - binaryop/binop-compiled-test.cpp - binaryop/binop-compiled-fixed_point-test.cpp - binaryop/binop-generic-ptx-test.cpp - ) - -################################################################################################### -# - unary transform tests ------------------------------------------------------------------------- -ConfigureTest(TRANSFORM_TEST - transform/integration/unary-transform-test.cpp - transform/nans_to_null_test.cpp - transform/mask_to_bools_test.cpp - transform/bools_to_mask_test.cpp - transform/row_bit_count_test.cu) - -################################################################################################### -# - interop tests ------------------------------------------------------------------------- -ConfigureTest(INTEROP_TEST - interop/to_arrow_test.cpp - interop/from_arrow_test.cpp - interop/dlpack_test.cpp) - -################################################################################################### -# - io tests -------------------------------------------------------------------------------------- +# ################################################################################################## +# * binary tests ---------------------------------------------------------------------------------- +ConfigureTest( + BINARY_TEST + binaryop/binop-verify-input-test.cpp + binaryop/binop-null-test.cpp + binaryop/binop-integration-test.cpp + binaryop/binop-compiled-test.cpp + binaryop/binop-compiled-fixed_point-test.cpp + binaryop/binop-generic-ptx-test.cpp +) + +# ################################################################################################## +# * unary transform tests ------------------------------------------------------------------------- +ConfigureTest( + TRANSFORM_TEST + transform/integration/unary-transform-test.cpp + transform/nans_to_null_test.cpp + transform/mask_to_bools_test.cpp + transform/bools_to_mask_test.cpp + transform/row_bit_count_test.cu + transform/one_hot_encode_tests.cpp +) + +# ################################################################################################## +# * interop tests ------------------------------------------------------------------------- +ConfigureTest( + INTEROP_TEST interop/to_arrow_test.cpp interop/from_arrow_test.cpp interop/dlpack_test.cpp +) + +# ################################################################################################## +# * io tests -------------------------------------------------------------------------------------- ConfigureTest(DECOMPRESSION_TEST io/comp/decomp_test.cpp) ConfigureTest(CSV_TEST io/csv_test.cpp) @@ -205,253 +202,256 @@ if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() -################################################################################################### -# - sort tests ------------------------------------------------------------------------------------ -ConfigureTest(SORT_TEST - sort/segmented_sort_tests.cpp - sort/sort_test.cpp - sort/rank_test.cpp) - -################################################################################################### -# - copying tests --------------------------------------------------------------------------------- -ConfigureTest(COPYING_TEST - copying/concatenate_tests.cu - copying/copy_if_else_nested_tests.cpp - copying/copy_range_tests.cpp - copying/copy_tests.cu - copying/detail_gather_tests.cu 
- copying/gather_list_tests.cpp - copying/gather_str_tests.cpp - copying/gather_struct_tests.cpp - copying/gather_tests.cpp - copying/get_value_tests.cpp - copying/pack_tests.cpp - copying/sample_tests.cpp - copying/scatter_tests.cpp - copying/scatter_list_tests.cpp - copying/scatter_list_scalar_tests.cpp - copying/scatter_struct_tests.cpp - copying/scatter_struct_scalar_tests.cpp - copying/segmented_gather_list_tests.cpp - copying/shift_tests.cpp - copying/slice_tests.cpp - copying/split_tests.cpp - copying/utility_tests.cpp - copying/reverse_tests.cpp) - -################################################################################################### -# - utilities tests ------------------------------------------------------------------------------- -ConfigureTest(UTILITIES_TEST - utilities_tests/type_list_tests.cpp - utilities_tests/column_utilities_tests.cpp - utilities_tests/column_wrapper_tests.cpp - utilities_tests/lists_column_wrapper_tests.cpp - utilities_tests/default_stream_tests.cpp - utilities_tests/type_check_tests.cpp) - -################################################################################################### -# - span tests ------------------------------------------------------------------------------- +# ################################################################################################## +# * sort tests ------------------------------------------------------------------------------------ +ConfigureTest(SORT_TEST sort/segmented_sort_tests.cpp sort/sort_test.cpp sort/rank_test.cpp) + +# ################################################################################################## +# * copying tests --------------------------------------------------------------------------------- +ConfigureTest( + COPYING_TEST + copying/concatenate_tests.cu + copying/copy_if_else_nested_tests.cpp + copying/copy_range_tests.cpp + copying/copy_tests.cpp + copying/detail_gather_tests.cu + copying/gather_list_tests.cpp + copying/gather_str_tests.cpp + copying/gather_struct_tests.cpp + copying/gather_tests.cpp + copying/get_value_tests.cpp + copying/pack_tests.cpp + copying/sample_tests.cpp + copying/scatter_tests.cpp + copying/scatter_list_tests.cpp + copying/scatter_list_scalar_tests.cpp + copying/scatter_struct_tests.cpp + copying/scatter_struct_scalar_tests.cpp + copying/segmented_gather_list_tests.cpp + copying/shift_tests.cpp + copying/slice_tests.cpp + copying/split_tests.cpp + copying/utility_tests.cpp + copying/reverse_tests.cpp +) + +# ################################################################################################## +# * utilities tests ------------------------------------------------------------------------------- +ConfigureTest( + UTILITIES_TEST + utilities_tests/type_list_tests.cpp + utilities_tests/column_utilities_tests.cpp + utilities_tests/column_wrapper_tests.cpp + utilities_tests/lists_column_wrapper_tests.cpp + utilities_tests/default_stream_tests.cpp + utilities_tests/type_check_tests.cpp +) + +# ################################################################################################## +# * span tests ------------------------------------------------------------------------------- ConfigureTest(SPAN_TEST utilities_tests/span_tests.cu) -################################################################################################### -# - iterator tests -------------------------------------------------------------------------------- -ConfigureTest(ITERATOR_TEST - iterator/value_iterator.cpp - 
iterator/value_iterator_test_chrono.cu - iterator/value_iterator_test_numeric.cu - iterator/value_iterator_test_strings.cu - iterator/value_iterator_test_transform.cu - iterator/pair_iterator_test_chrono.cu - iterator/pair_iterator_test_numeric.cu - iterator/scalar_iterator_test.cu - iterator/optional_iterator_test_chrono.cu - iterator/optional_iterator_test_numeric.cu - iterator/indexalator_test.cu - ) - -################################################################################################### -# - device atomics tests -------------------------------------------------------------------------- +# ################################################################################################## +# * iterator tests -------------------------------------------------------------------------------- +ConfigureTest( + ITERATOR_TEST + iterator/value_iterator.cpp + iterator/value_iterator_test_chrono.cu + iterator/value_iterator_test_numeric.cu + iterator/value_iterator_test_strings.cu + iterator/value_iterator_test_transform.cu + iterator/pair_iterator_test_chrono.cu + iterator/pair_iterator_test_numeric.cu + iterator/scalar_iterator_test.cu + iterator/optional_iterator_test_chrono.cu + iterator/optional_iterator_test_numeric.cu + iterator/indexalator_test.cu +) + +# ################################################################################################## +# * device atomics tests -------------------------------------------------------------------------- ConfigureTest(DEVICE_ATOMICS_TEST device_atomics/device_atomics_test.cu) -################################################################################################### -# - transpose tests ------------------------------------------------------------------------------- +# ################################################################################################## +# * transpose tests ------------------------------------------------------------------------------- ConfigureTest(TRANSPOSE_TEST transpose/transpose_test.cpp) -################################################################################################### -# - table tests ----------------------------------------------------------------------------------- -ConfigureTest(TABLE_TEST - table/table_tests.cpp - table/table_view_tests.cu - table/row_operators_tests.cpp) - -################################################################################################### -# - sorted-merge tests ---------------------------------------------------------------------------- -ConfigureTest(MERGE_TEST - merge/merge_test.cpp - merge/merge_dictionary_test.cpp - merge/merge_string_test.cpp) - -################################################################################################### -# - stream compaction tests ----------------------------------------------------------------------- -ConfigureTest(STREAM_COMPACTION_TEST - stream_compaction/apply_boolean_mask_tests.cpp - stream_compaction/drop_nulls_tests.cpp - stream_compaction/drop_nans_tests.cpp - stream_compaction/drop_duplicates_tests.cpp) - -################################################################################################### -# - rolling tests --------------------------------------------------------------------------------- -ConfigureTest(ROLLING_TEST - rolling/collect_ops_test.cpp - rolling/empty_input_test.cpp - rolling/grouped_rolling_test.cpp - rolling/lead_lag_test.cpp - rolling/range_rolling_window_test.cpp - rolling/range_window_bounds_test.cpp - rolling/rolling_test.cpp) - 
-################################################################################################### -# - filling test ---------------------------------------------------------------------------------- -ConfigureTest(FILLING_TEST - filling/fill_tests.cpp - filling/repeat_tests.cpp - filling/sequence_tests.cpp) - -################################################################################################### -# - search test ----------------------------------------------------------------------------------- -ConfigureTest(SEARCH_TEST - search/search_dictionary_test.cpp - search/search_struct_test.cpp - search/search_test.cpp) - -################################################################################################### -# - reshape test ---------------------------------------------------------------------------------- -ConfigureTest(RESHAPE_TEST - reshape/byte_cast_tests.cpp - reshape/interleave_columns_tests.cpp - reshape/tile_tests.cpp) - -################################################################################################### -# - traits test ----------------------------------------------------------------------------------- +# ################################################################################################## +# * table tests ----------------------------------------------------------------------------------- +ConfigureTest( + TABLE_TEST table/table_tests.cpp table/table_view_tests.cu table/row_operators_tests.cpp +) + +# ################################################################################################## +# * sorted-merge tests ---------------------------------------------------------------------------- +ConfigureTest( + MERGE_TEST merge/merge_test.cpp merge/merge_dictionary_test.cpp merge/merge_string_test.cpp +) + +# ################################################################################################## +# * stream compaction tests ----------------------------------------------------------------------- +ConfigureTest( + STREAM_COMPACTION_TEST + stream_compaction/apply_boolean_mask_tests.cpp + stream_compaction/drop_nulls_tests.cpp + stream_compaction/drop_nans_tests.cpp + stream_compaction/drop_duplicates_tests.cpp +) + +# ################################################################################################## +# * rolling tests --------------------------------------------------------------------------------- +ConfigureTest( + ROLLING_TEST + rolling/collect_ops_test.cpp + rolling/empty_input_test.cpp + rolling/grouped_rolling_test.cpp + rolling/lead_lag_test.cpp + rolling/range_rolling_window_test.cpp + rolling/range_window_bounds_test.cpp + rolling/rolling_test.cpp +) + +# ################################################################################################## +# * filling test ---------------------------------------------------------------------------------- +ConfigureTest( + FILLING_TEST filling/fill_tests.cpp filling/repeat_tests.cpp filling/sequence_tests.cpp +) + +# ################################################################################################## +# * search test ----------------------------------------------------------------------------------- +ConfigureTest( + SEARCH_TEST search/search_dictionary_test.cpp search/search_struct_test.cpp + search/search_test.cpp +) + +# ################################################################################################## +# * reshape test ---------------------------------------------------------------------------------- 
+ConfigureTest( + RESHAPE_TEST reshape/byte_cast_tests.cpp reshape/interleave_columns_tests.cpp + reshape/tile_tests.cpp +) + +# ################################################################################################## +# * traits test ----------------------------------------------------------------------------------- ConfigureTest(TRAITS_TEST types/traits_test.cpp) -################################################################################################### -# - factories test -------------------------------------------------------------------------------- -ConfigureTest(FACTORIES_TEST - scalar/factories_test.cpp - column/factories_test.cpp) +# ################################################################################################## +# * factories test -------------------------------------------------------------------------------- +ConfigureTest(FACTORIES_TEST scalar/factories_test.cpp column/factories_test.cpp) -################################################################################################### -# - dispatcher test ------------------------------------------------------------------------------- +# ################################################################################################## +# * dispatcher test ------------------------------------------------------------------------------- ConfigureTest(DISPATCHER_TEST types/type_dispatcher_test.cu) -################################################################################################### -# - strings test ---------------------------------------------------------------------------------- -ConfigureTest(STRINGS_TEST - strings/array_tests.cu - strings/attrs_tests.cpp - strings/booleans_tests.cpp - strings/case_tests.cpp - strings/chars_types_tests.cpp - strings/combine/concatenate_tests.cpp - strings/combine/join_list_elements_tests.cpp - strings/combine/join_strings_tests.cpp - strings/concatenate_tests.cpp - strings/contains_tests.cpp - strings/datetime_tests.cpp - strings/durations_tests.cpp - strings/extract_tests.cpp - strings/factories_test.cu - strings/fill_tests.cpp - strings/findall_tests.cpp - strings/find_tests.cpp - strings/find_multiple_tests.cpp - strings/fixed_point_tests.cpp - strings/floats_tests.cpp - strings/hash_string.cu - strings/integers_tests.cpp - strings/ipv4_tests.cpp - strings/json_tests.cpp - strings/pad_tests.cpp - strings/repeat_strings_tests.cpp - strings/replace_regex_tests.cpp - strings/replace_tests.cpp - strings/split_tests.cpp - strings/strip_tests.cpp - strings/substring_tests.cpp - strings/translate_tests.cpp - strings/urls_tests.cpp) - -################################################################################################### -# - structs test ---------------------------------------------------------------------------------- -ConfigureTest(STRUCTS_TEST - structs/structs_column_tests.cpp - structs/utilities_tests.cpp - ) - -################################################################################################### -# - nvtext test ----------------------------------------------------------------------------------- -ConfigureTest(TEXT_TEST - text/edit_distance_tests.cpp - text/ngrams_tests.cpp - text/ngrams_tokenize_tests.cpp - text/normalize_tests.cpp - text/replace_tests.cpp - text/stemmer_tests.cpp - text/subword_tests.cpp - text/tokenize_tests.cpp) - -################################################################################################### -# - bitmask tests 
--------------------------------------------------------------------------------- -ConfigureTest(BITMASK_TEST - bitmask/valid_if_tests.cu - bitmask/set_nullmask_tests.cu - bitmask/bitmask_tests.cpp - bitmask/is_element_valid_tests.cpp) - - -################################################################################################### -# - dictionary tests ------------------------------------------------------------------------------ -ConfigureTest(DICTIONARY_TEST - dictionary/add_keys_test.cpp - dictionary/decode_test.cpp - dictionary/encode_test.cpp - dictionary/factories_test.cpp - dictionary/fill_test.cpp - dictionary/gather_test.cpp - dictionary/remove_keys_test.cpp - dictionary/scatter_test.cpp - dictionary/search_test.cpp - dictionary/set_keys_test.cpp - dictionary/slice_test.cpp) - -################################################################################################### -# - encode tests ----------------------------------------------------------------------------------- +# ################################################################################################## +# * strings test ---------------------------------------------------------------------------------- +ConfigureTest( + STRINGS_TEST + strings/array_tests.cpp + strings/attrs_tests.cpp + strings/booleans_tests.cpp + strings/case_tests.cpp + strings/chars_types_tests.cpp + strings/combine/concatenate_tests.cpp + strings/combine/join_list_elements_tests.cpp + strings/combine/join_strings_tests.cpp + strings/concatenate_tests.cpp + strings/contains_tests.cpp + strings/datetime_tests.cpp + strings/durations_tests.cpp + strings/extract_tests.cpp + strings/factories_test.cu + strings/fill_tests.cpp + strings/findall_tests.cpp + strings/find_tests.cpp + strings/find_multiple_tests.cpp + strings/fixed_point_tests.cpp + strings/floats_tests.cpp + strings/format_lists_tests.cpp + strings/integers_tests.cpp + strings/ipv4_tests.cpp + strings/json_tests.cpp + strings/pad_tests.cpp + strings/repeat_strings_tests.cpp + strings/replace_regex_tests.cpp + strings/replace_tests.cpp + strings/split_tests.cpp + strings/strip_tests.cpp + strings/substring_tests.cpp + strings/translate_tests.cpp + strings/urls_tests.cpp +) + +# ################################################################################################## +# * structs test ---------------------------------------------------------------------------------- +ConfigureTest(STRUCTS_TEST structs/structs_column_tests.cpp structs/utilities_tests.cpp) + +# ################################################################################################## +# * nvtext test ----------------------------------------------------------------------------------- +ConfigureTest( + TEXT_TEST + text/edit_distance_tests.cpp + text/ngrams_tests.cpp + text/ngrams_tokenize_tests.cpp + text/normalize_tests.cpp + text/replace_tests.cpp + text/stemmer_tests.cpp + text/subword_tests.cpp + text/tokenize_tests.cpp +) + +# ################################################################################################## +# * bitmask tests --------------------------------------------------------------------------------- +ConfigureTest( + BITMASK_TEST bitmask/valid_if_tests.cu bitmask/set_nullmask_tests.cu bitmask/bitmask_tests.cpp + bitmask/is_element_valid_tests.cpp +) + +# ################################################################################################## +# * dictionary tests ------------------------------------------------------------------------------ +ConfigureTest( 
+ DICTIONARY_TEST + dictionary/add_keys_test.cpp + dictionary/decode_test.cpp + dictionary/encode_test.cpp + dictionary/factories_test.cpp + dictionary/fill_test.cpp + dictionary/gather_test.cpp + dictionary/remove_keys_test.cpp + dictionary/scatter_test.cpp + dictionary/search_test.cpp + dictionary/set_keys_test.cpp + dictionary/slice_test.cpp +) + +# ################################################################################################## +# * encode tests ----------------------------------------------------------------------------------- ConfigureTest(ENCODE_TEST encode/encode_tests.cpp) -################################################################################################### -# - ast tests ------------------------------------------------------------------------------------- +# ################################################################################################## +# * ast tests ------------------------------------------------------------------------------------- ConfigureTest(AST_TEST ast/transform_tests.cpp) -################################################################################################### -# - lists tests ---------------------------------------------------------------------------------- -ConfigureTest(LISTS_TEST - lists/combine/concatenate_list_elements_tests.cpp - lists/combine/concatenate_rows_tests.cpp - lists/contains_tests.cpp - lists/count_elements_tests.cpp - lists/drop_list_duplicates_tests.cpp - lists/explode_tests.cpp - lists/extract_tests.cpp - lists/sort_lists_tests.cpp) - -################################################################################################### -# - bin tests ---------------------------------------------------------------------------------- -ConfigureTest(LABEL_BINS_TEST - labeling/label_bins_tests.cpp) - -################################################################################################### -### enable testing ################################################################################ -################################################################################################### +# ################################################################################################## +# * lists tests ---------------------------------------------------------------------------------- +ConfigureTest( + LISTS_TEST + lists/combine/concatenate_list_elements_tests.cpp + lists/combine/concatenate_rows_tests.cpp + lists/contains_tests.cpp + lists/count_elements_tests.cpp + lists/drop_list_duplicates_tests.cpp + lists/explode_tests.cpp + lists/extract_tests.cpp + lists/sort_lists_tests.cpp +) + +# ################################################################################################## +# * bin tests ---------------------------------------------------------------------------------- +ConfigureTest(LABEL_BINS_TEST labeling/label_bins_tests.cpp) + +# ################################################################################################## +# enable testing ################################################################################ +# ################################################################################################## enable_testing() diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 175918a0846..8cfd6d24fae 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -109,6 +109,23 @@ TEST_F(TransformTest, BasicAddition) cudf::test::expect_columns_equal(expected, result->view(), 
verbosity); } +TEST_F(TransformTest, BasicAdditionCast) +{ + auto c_0 = column_wrapper{3, 20, 1, 50}; + auto c_1 = column_wrapper{10, 7, 20, 0}; + auto table = cudf::table_view{{c_0, c_1}}; + + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(1); + auto cast = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_INT64, col_ref_1); + auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, cast); + + auto expected = column_wrapper{13, 27, 21, 50}; + auto result = cudf::compute_column(table, expression); + + cudf::test::expect_columns_equal(expected, result->view(), verbosity); +} + TEST_F(TransformTest, BasicEquality) { auto c_0 = column_wrapper{3, 20, 1, 50}; diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp index a6477247356..7925f0dd618 100644 --- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp @@ -38,7 +38,7 @@ struct FixedPointCompiledTestBothReps : public cudf::test::BaseFixture { template using wrapper = cudf::test::fixed_width_column_wrapper; -TYPED_TEST_CASE(FixedPointCompiledTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointCompiledTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointCompiledTestBothReps, FixedPointBinaryOpAdd) { diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 25d2f1d2c24..a177a8378b7 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -125,7 +125,7 @@ using Add_types = template struct BinaryOperationCompiledTest_Add : public BinaryOperationCompiledTest { }; -TYPED_TEST_CASE(BinaryOperationCompiledTest_Add, Add_types); +TYPED_TEST_SUITE(BinaryOperationCompiledTest_Add, Add_types); TYPED_TEST(BinaryOperationCompiledTest_Add, Vector_Vector) { @@ -149,7 +149,7 @@ using Sub_types = template struct BinaryOperationCompiledTest_Sub : public BinaryOperationCompiledTest { }; -TYPED_TEST_CASE(BinaryOperationCompiledTest_Sub, Sub_types); +TYPED_TEST_SUITE(BinaryOperationCompiledTest_Sub, Sub_types); TYPED_TEST(BinaryOperationCompiledTest_Sub, Vector_Vector) { @@ -172,7 +172,7 @@ using Mul_types = template struct BinaryOperationCompiledTest_Mul : public BinaryOperationCompiledTest { }; -TYPED_TEST_CASE(BinaryOperationCompiledTest_Mul, Mul_types); +TYPED_TEST_SUITE(BinaryOperationCompiledTest_Mul, Mul_types); TYPED_TEST(BinaryOperationCompiledTest_Mul, Vector_Vector) { @@ -197,7 +197,7 @@ using Div_types = template struct BinaryOperationCompiledTest_Div : public BinaryOperationCompiledTest { }; -TYPED_TEST_CASE(BinaryOperationCompiledTest_Div, Div_types); +TYPED_TEST_SUITE(BinaryOperationCompiledTest_Div, Div_types); TYPED_TEST(BinaryOperationCompiledTest_Div, Vector_Vector) { @@ -219,7 +219,7 @@ using TrueDiv_types = template struct BinaryOperationCompiledTest_TrueDiv : public BinaryOperationCompiledTest { }; -TYPED_TEST_CASE(BinaryOperationCompiledTest_TrueDiv, TrueDiv_types); +TYPED_TEST_SUITE(BinaryOperationCompiledTest_TrueDiv, TrueDiv_types); TYPED_TEST(BinaryOperationCompiledTest_TrueDiv, Vector_Vector) { @@ -249,7 +249,7 @@ using Mod_types = cudf::test::Types struct BinaryOperationCompiledTest_Mod : public BinaryOperationCompiledTest { }; -TYPED_TEST_CASE(BinaryOperationCompiledTest_Mod, Mod_types); +TYPED_TEST_SUITE(BinaryOperationCompiledTest_Mod, Mod_types); TYPED_TEST(BinaryOperationCompiledTest_Mod, 
Vector_Vector) { @@ -268,7 +268,7 @@ using PyMod_types = cudf::test::Types struct BinaryOperationCompiledTest_PyMod : public BinaryOperationCompiledTest { }; -TYPED_TEST_CASE(BinaryOperationCompiledTest_PyMod, PyMod_types); +TYPED_TEST_SUITE(BinaryOperationCompiledTest_PyMod, PyMod_types); TYPED_TEST(BinaryOperationCompiledTest_PyMod, Vector_Vector) { this->template test(cudf::binary_operator::PYMOD); @@ -292,7 +292,7 @@ using Pow_types = cudf::test::Types, template struct BinaryOperationCompiledTest_FloatOps : public BinaryOperationCompiledTest { }; -TYPED_TEST_CASE(BinaryOperationCompiledTest_FloatOps, Pow_types); +TYPED_TEST_SUITE(BinaryOperationCompiledTest_FloatOps, Pow_types); TYPED_TEST(BinaryOperationCompiledTest_FloatOps, Pow_Vector_Vector) { @@ -395,7 +395,7 @@ using Bit_types = cudf::test::Types, template struct BinaryOperationCompiledTest_Bit : public BinaryOperationCompiledTest { }; -TYPED_TEST_CASE(BinaryOperationCompiledTest_Bit, Bit_types); +TYPED_TEST_SUITE(BinaryOperationCompiledTest_Bit, Bit_types); TYPED_TEST(BinaryOperationCompiledTest_Bit, BitwiseAnd_Vector_Vector) { @@ -444,7 +444,7 @@ using Logical_types = cudf::test::Types template struct BinaryOperationCompiledTest_Logical : public BinaryOperationCompiledTest { }; -TYPED_TEST_CASE(BinaryOperationCompiledTest_Logical, Logical_types); +TYPED_TEST_SUITE(BinaryOperationCompiledTest_Logical, Logical_types); TYPED_TEST(BinaryOperationCompiledTest_Logical, LogicalAnd_Vector_Vector) { @@ -472,7 +472,7 @@ using Comparison_types = template struct BinaryOperationCompiledTest_Comparison : public BinaryOperationCompiledTest { }; -TYPED_TEST_CASE(BinaryOperationCompiledTest_Comparison, Comparison_types); +TYPED_TEST_SUITE(BinaryOperationCompiledTest_Comparison, Comparison_types); TYPED_TEST(BinaryOperationCompiledTest_Comparison, Equal_Vector_Vector) { @@ -526,7 +526,7 @@ using Null_types = template struct BinaryOperationCompiledTest_NullOps : public BinaryOperationCompiledTest { }; -TYPED_TEST_CASE(BinaryOperationCompiledTest_NullOps, Null_types); +TYPED_TEST_SUITE(BinaryOperationCompiledTest_NullOps, Null_types); template using column_wrapper = std::conditional_t, diff --git a/cpp/tests/binaryop/binop-integration-test.cpp b/cpp/tests/binaryop/binop-integration-test.cpp index ec011a84037..9cdd03fdd62 100644 --- a/cpp/tests/binaryop/binop-integration-test.cpp +++ b/cpp/tests/binaryop/binop-integration-test.cpp @@ -2023,7 +2023,7 @@ struct FixedPointTestBothReps : public cudf::test::BaseFixture { template using wrapper = cudf::test::fixed_width_column_wrapper; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, FixedPointBinaryOpAdd) { diff --git a/cpp/tests/column/bit_cast_test.cpp b/cpp/tests/column/bit_cast_test.cpp index 6bf8839aec4..c6e6082d4be 100644 --- a/cpp/tests/column/bit_cast_test.cpp +++ b/cpp/tests/column/bit_cast_test.cpp @@ -55,7 +55,7 @@ template struct ColumnViewAllTypesTests : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ColumnViewAllTypesTests, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(ColumnViewAllTypesTests, cudf::test::FixedWidthTypes); template void do_bit_cast(cudf::column_view const& column_view, Iterator begin, Iterator end) diff --git a/cpp/tests/column/column_test.cu b/cpp/tests/column/column_test.cu index 909b8fc1b6b..5d7d570b321 100644 --- a/cpp/tests/column/column_test.cu +++ b/cpp/tests/column/column_test.cu @@ -59,7 +59,7 @@ struct TypedColumnTest : public 
cudf::test::BaseFixture { rmm::device_buffer all_null_mask{create_null_mask(num_elements(), cudf::mask_state::ALL_NULL)}; }; -TYPED_TEST_CASE(TypedColumnTest, cudf::test::Types); +TYPED_TEST_SUITE(TypedColumnTest, cudf::test::Types); /** * @brief Verifies equality of the properties and data of a `column`'s views. @@ -443,7 +443,7 @@ struct ListsColumnTest : public cudf::test::BaseFixture { using NumericTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(ListsColumnTest, NumericTypesNotBool); +TYPED_TEST_SUITE(ListsColumnTest, NumericTypesNotBool); TYPED_TEST(ListsColumnTest, ListsColumnViewConstructor) { diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp index f76f682bb2f..ab324ea8505 100644 --- a/cpp/tests/column/column_view_shallow_test.cpp +++ b/cpp/tests/column/column_view_shallow_test.cpp @@ -79,7 +79,7 @@ struct ColumnViewShallowTests : public cudf::test::BaseFixture { }; using AllTypes = cudf::test::Concat; -TYPED_TEST_CASE(ColumnViewShallowTests, AllTypes); +TYPED_TEST_SUITE(ColumnViewShallowTests, AllTypes); // Test for fixed_width, dict, string, list, struct // column_view, column_view = same hash. diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index bdaa20f63bb..728b0fdf7e5 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -40,7 +40,7 @@ template class NumericFactoryTest : public ColumnFactoryTest { }; -TYPED_TEST_CASE(NumericFactoryTest, cudf::test::NumericTypes); +TYPED_TEST_SUITE(NumericFactoryTest, cudf::test::NumericTypes); TYPED_TEST(NumericFactoryTest, EmptyNoMask) { @@ -203,7 +203,7 @@ template class FixedWidthFactoryTest : public ColumnFactoryTest { }; -TYPED_TEST_CASE(FixedWidthFactoryTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(FixedWidthFactoryTest, cudf::test::FixedWidthTypes); TYPED_TEST(FixedWidthFactoryTest, EmptyNoMask) { @@ -219,7 +219,7 @@ template class EmptyFactoryTest : public ColumnFactoryTest { }; -TYPED_TEST_CASE(EmptyFactoryTest, cudf::test::AllTypes); +TYPED_TEST_SUITE(EmptyFactoryTest, cudf::test::AllTypes); TYPED_TEST(EmptyFactoryTest, Empty) { @@ -466,7 +466,7 @@ template class ListsFixedWidthLeafTest : public ColumnFactoryTest { }; -TYPED_TEST_CASE(ListsFixedWidthLeafTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(ListsFixedWidthLeafTest, cudf::test::FixedWidthTypes); TYPED_TEST(ListsFixedWidthLeafTest, FromNonNested) { @@ -504,7 +504,7 @@ template class ListsDictionaryLeafTest : public ColumnFactoryTest { }; -TYPED_TEST_CASE(ListsDictionaryLeafTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(ListsDictionaryLeafTest, cudf::test::FixedWidthTypes); TYPED_TEST(ListsDictionaryLeafTest, FromNonNested) { @@ -613,7 +613,7 @@ class ListsStructsLeafTest : public ColumnFactoryTest { } }; -TYPED_TEST_CASE(ListsStructsLeafTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(ListsStructsLeafTest, cudf::test::FixedWidthTypes); TYPED_TEST(ListsStructsLeafTest, FromNonNested) { diff --git a/cpp/tests/copying/concatenate_tests.cu b/cpp/tests/copying/concatenate_tests.cu index c48f7ad4dbc..d2279580c58 100644 --- a/cpp/tests/copying/concatenate_tests.cu +++ b/cpp/tests/copying/concatenate_tests.cu @@ -79,7 +79,7 @@ struct TypedColumnTest : public cudf::test::BaseFixture { rmm::device_buffer all_null_mask{create_null_mask(num_elements(), cudf::mask_state::ALL_NULL)}; }; -TYPED_TEST_CASE(TypedColumnTest, cudf::test::Types); +TYPED_TEST_SUITE(TypedColumnTest, cudf::test::Types); TYPED_TEST(TypedColumnTest, 
ConcatenateEmptyColumns) { @@ -1555,7 +1555,7 @@ struct FixedPointTestBothReps : public cudf::test::BaseFixture { struct FixedPointTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, FixedPointConcatentate) { @@ -1632,7 +1632,7 @@ template struct DictionaryConcatTestFW : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(DictionaryConcatTestFW, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(DictionaryConcatTestFW, cudf::test::FixedWidthTypes); TYPED_TEST(DictionaryConcatTestFW, FixedWidthKeys) { diff --git a/cpp/tests/copying/copy_if_else_nested_tests.cpp b/cpp/tests/copying/copy_if_else_nested_tests.cpp index d9f6210b13c..0eca46c0b88 100644 --- a/cpp/tests/copying/copy_if_else_nested_tests.cpp +++ b/cpp/tests/copying/copy_if_else_nested_tests.cpp @@ -32,7 +32,7 @@ template struct TypedCopyIfElseNestedTest : CopyIfElseNestedTest { }; -TYPED_TEST_CASE(TypedCopyIfElseNestedTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(TypedCopyIfElseNestedTest, cudf::test::FixedWidthTypes); TYPED_TEST(TypedCopyIfElseNestedTest, Structs) { diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp index 1ab320cb8dc..d3463fc3cc4 100644 --- a/cpp/tests/copying/copy_range_tests.cpp +++ b/cpp/tests/copying/copy_range_tests.cpp @@ -60,7 +60,7 @@ class CopyRangeTypedTestFixture : public cudf::test::BaseFixture { } }; -TYPED_TEST_CASE(CopyRangeTypedTestFixture, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(CopyRangeTypedTestFixture, cudf::test::FixedWidthTypesWithoutFixedPoint); TYPED_TEST(CopyRangeTypedTestFixture, CopyWithNulls) { @@ -471,7 +471,7 @@ template struct FixedPointTypesCopyRange : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTypesCopyRange, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTypesCopyRange, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTypesCopyRange, FixedPointSimple) { diff --git a/cpp/tests/copying/copy_tests.cu b/cpp/tests/copying/copy_tests.cpp similarity index 82% rename from cpp/tests/copying/copy_tests.cu rename to cpp/tests/copying/copy_tests.cpp index 03869c37adf..651a977050c 100644 --- a/cpp/tests/copying/copy_tests.cu +++ b/cpp/tests/copying/copy_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
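A note on the recurring TYPED_TEST_CASE to TYPED_TEST_SUITE changes that run through all of these test files: GoogleTest renamed its typed-test registration macro from TYPED_TEST_CASE to TYPED_TEST_SUITE (around release 1.10) and deprecated the old spelling, so each of these edits is a mechanical rename with no effect on which tests are built or run. A minimal sketch of the new spelling, using a hypothetical fixture name and type list rather than any actual cudf test:

#include <gtest/gtest.h>

// Hypothetical typed fixture; the fixture name and type list are illustrative only.
template <typename T>
struct ExampleTypedTest : public ::testing::Test {
};

using ExampleTypes = ::testing::Types<int32_t, float, double>;

// TYPED_TEST_SUITE replaces the deprecated TYPED_TEST_CASE spelling.
TYPED_TEST_SUITE(ExampleTypedTest, ExampleTypes);

TYPED_TEST(ExampleTypedTest, ValueInitializedIsZero)
{
  using T = TypeParam;
  EXPECT_EQ(T{}, static_cast<T>(0));
}

Each TYPED_TEST still instantiates once per type in the registered list, exactly as it did under the old macro name.
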
@@ -21,20 +21,15 @@ #include #include -#include #include -#include #include #include -#include - -#include template struct CopyTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(CopyTest, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(CopyTest, cudf::test::FixedWidthTypesWithoutFixedPoint); #define wrapper cudf::test::fixed_width_column_wrapper @@ -66,90 +61,6 @@ TYPED_TEST(CopyTest, CopyIfElseTestManyNulls) CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); } -struct copy_if_else_tiny_grid_functor { - template ())> - std::unique_ptr operator()(cudf::column_view const& lhs, - cudf::column_view const& rhs, - Filter filter, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - // output - std::unique_ptr out = - cudf::allocate_like(lhs, lhs.size(), cudf::mask_allocation_policy::RETAIN, mr); - - // device views - auto lhs_view = cudf::column_device_view::create(lhs); - auto rhs_view = cudf::column_device_view::create(rhs); - auto lhs_iter = cudf::detail::make_pair_iterator(*lhs_view); - auto rhs_iter = cudf::detail::make_pair_iterator(*rhs_view); - auto out_dv = cudf::mutable_column_device_view::create(*out); - - // call the kernel with an artificially small grid - cudf::detail::copy_if_else_kernel<32, T, decltype(lhs_iter), decltype(rhs_iter), Filter, false> - <<<1, 32, 0, stream.value()>>>(lhs_iter, rhs_iter, filter, *out_dv, nullptr); - - return out; - } - - template ())> - std::unique_ptr operator()(cudf::column_view const& lhs, - cudf::column_view const& rhs, - Filter filter, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - { - CUDF_FAIL("Unexpected test execution"); - } -}; - -std::unique_ptr tiny_grid_launch(cudf::column_view const& lhs, - cudf::column_view const& rhs, - cudf::column_view const& boolean_mask) -{ - auto bool_mask_device_p = cudf::column_device_view::create(boolean_mask); - cudf::column_device_view bool_mask_device = *bool_mask_device_p; - auto filter = [bool_mask_device] __device__(cudf::size_type i) { - return bool_mask_device.element(i); - }; - return cudf::type_dispatcher(lhs.type(), - copy_if_else_tiny_grid_functor{}, - lhs, - rhs, - filter, - rmm::cuda_stream_default, - rmm::mr::get_current_device_resource()); -} - -TYPED_TEST(CopyTest, CopyIfElseTestTinyGrid) -{ - using T = TypeParam; - - // make sure we span at least 2 warps - int num_els = 64; - - bool mask[] = {1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - cudf::test::fixed_width_column_wrapper mask_w(mask, mask + num_els); - - wrapper lhs_w({5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}); - - wrapper rhs_w({6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}); - - wrapper expected_w({5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, - 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5}); - - auto out = tiny_grid_launch(lhs_w, rhs_w, mask_w); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); -} - TYPED_TEST(CopyTest, CopyIfElseTestLong) { using T = TypeParam; @@ -190,6 +101,27 @@ 
TYPED_TEST(CopyTest, CopyIfElseTestLong) CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), expected_w); } +TYPED_TEST(CopyTest, CopyIfElseTestMultipleBlocks) +{ + using T = TypeParam; + + int num = 1000; // larger than a single block + std::vector h_lhs(num, 5); + std::vector h_rhs(num, 6); + std::vector h_mask(num, false); + std::vector h_validity(num, true); + h_validity[0] = 0; + + cudf::test::fixed_width_column_wrapper lhs_w( + h_lhs.begin(), h_lhs.end(), h_validity.begin()); + cudf::test::fixed_width_column_wrapper rhs_w( + h_rhs.begin(), h_rhs.end(), h_validity.begin()); + cudf::test::fixed_width_column_wrapper mask_w(h_mask.begin(), h_mask.end()); + + auto out = cudf::copy_if_else(lhs_w, rhs_w, mask_w); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(out->view(), rhs_w); +} + TYPED_TEST(CopyTest, CopyIfElseTestEmptyInputs) { using T = TypeParam; @@ -367,7 +299,7 @@ TEST_F(CopyEmptyNested, CopyIfElseTestEmptyNestedScalars) template struct CopyTestNumeric : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(CopyTestNumeric, cudf::test::NumericTypes); +TYPED_TEST_SUITE(CopyTestNumeric, cudf::test::NumericTypes); TYPED_TEST(CopyTestNumeric, CopyIfElseTestScalarColumn) { @@ -457,7 +389,7 @@ struct create_chrono_scalar { template struct CopyTestChrono : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(CopyTestChrono, cudf::test::ChronoTypes); +TYPED_TEST_SUITE(CopyTestChrono, cudf::test::ChronoTypes); TYPED_TEST(CopyTestChrono, CopyIfElseTestScalarColumn) { @@ -647,7 +579,7 @@ template struct FixedPointTypes : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTypes, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTypes, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTypes, FixedPointSimple) { diff --git a/cpp/tests/copying/detail_gather_tests.cu b/cpp/tests/copying/detail_gather_tests.cu index f976a6bcf58..da72bd3cc63 100644 --- a/cpp/tests/copying/detail_gather_tests.cu +++ b/cpp/tests/copying/detail_gather_tests.cu @@ -36,7 +36,7 @@ template class GatherTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(GatherTest, cudf::test::NumericTypes); +TYPED_TEST_SUITE(GatherTest, cudf::test::NumericTypes); // This test exercises using different iterator types as gather map inputs // to cudf::detail::gather -- device_uvector and raw pointers. 
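The copy_tests change above is worth a brief note: the deleted CopyIfElseTestTinyGrid launched cudf::detail::copy_if_else_kernel directly with a __device__ lambda and an explicit <<<1, 32>>> grid, which requires CUDA compilation, and that appears to be why removing it allows the rename from copy_tests.cu to copy_tests.cpp. Its replacement, CopyIfElseTestMultipleBlocks, covers the multi-block case purely through the public cudf::copy_if_else API. Below is a minimal host-side sketch of that public API, assuming the overload taking two columns and a boolean mask as exercised by the new test; the fixture name and literal values are illustrative, not taken from the diff:

#include <cudf/copying.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>

// Hypothetical fixture used only for this sketch.
struct CopyIfElseSketch : public cudf::test::BaseFixture {
};

TEST_F(CopyIfElseSketch, SelectsPerElement)
{
  // copy_if_else selects from lhs where the mask is true and from rhs where it is false.
  cudf::test::fixed_width_column_wrapper<int32_t> lhs{5, 5, 5, 5};
  cudf::test::fixed_width_column_wrapper<int32_t> rhs{6, 6, 6, 6};
  cudf::test::fixed_width_column_wrapper<bool> mask{true, false, true, false};

  // Returns a std::unique_ptr<cudf::column> with one element chosen per row.
  auto result = cudf::copy_if_else(lhs, rhs, mask);

  cudf::test::fixed_width_column_wrapper<int32_t> expected{5, 6, 5, 6};
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
}

The new CopyIfElseTestMultipleBlocks test in the diff does the same thing with 1000 rows so that the selection spans more than a single thread block, but it goes through this public entry point rather than launching the detail kernel itself.
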
diff --git a/cpp/tests/copying/gather_list_tests.cpp b/cpp/tests/copying/gather_list_tests.cpp index 8249d2dc145..b26ee90c3b9 100644 --- a/cpp/tests/copying/gather_list_tests.cpp +++ b/cpp/tests/copying/gather_list_tests.cpp @@ -38,7 +38,7 @@ using FixedWidthTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(GatherTestListTyped, FixedWidthTypesNotBool); +TYPED_TEST_SUITE(GatherTestListTyped, FixedWidthTypesNotBool); class GatherTestList : public cudf::test::BaseFixture { }; diff --git a/cpp/tests/copying/gather_tests.cpp b/cpp/tests/copying/gather_tests.cpp index b33651237a6..141503ed978 100644 --- a/cpp/tests/copying/gather_tests.cpp +++ b/cpp/tests/copying/gather_tests.cpp @@ -32,7 +32,7 @@ template class GatherTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(GatherTest, cudf::test::NumericTypes); +TYPED_TEST_SUITE(GatherTest, cudf::test::NumericTypes); TYPED_TEST(GatherTest, IdentityTest) { diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp index c07db17ec15..32abd2dd71d 100644 --- a/cpp/tests/copying/get_value_tests.cpp +++ b/cpp/tests/copying/get_value_tests.cpp @@ -41,7 +41,7 @@ template struct FixedWidthGetValueTest : public BaseFixture { }; -TYPED_TEST_CASE(FixedWidthGetValueTest, FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(FixedWidthGetValueTest, FixedWidthTypesWithoutFixedPoint); TYPED_TEST(FixedWidthGetValueTest, BasicGet) { @@ -131,7 +131,7 @@ template struct DictionaryGetValueTest : public BaseFixture { }; -TYPED_TEST_CASE(DictionaryGetValueTest, FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(DictionaryGetValueTest, FixedWidthTypesWithoutFixedPoint); TYPED_TEST(DictionaryGetValueTest, BasicGet) { @@ -193,7 +193,7 @@ struct ListGetFixedWidthValueTest : public BaseFixture { } }; -TYPED_TEST_CASE(ListGetFixedWidthValueTest, FixedWidthTypes); +TYPED_TEST_SUITE(ListGetFixedWidthValueTest, FixedWidthTypes); TYPED_TEST(ListGetFixedWidthValueTest, NonNestedGetNonNullNonEmpty) { @@ -603,7 +603,7 @@ struct ListGetStructValueTest : public BaseFixture { } }; -TYPED_TEST_CASE(ListGetStructValueTest, FixedWidthTypes); +TYPED_TEST_SUITE(ListGetStructValueTest, FixedWidthTypes); TYPED_TEST(ListGetStructValueTest, NonNestedGetNonNullNonEmpty) { @@ -800,7 +800,7 @@ template struct StructGetValueTestTyped : public BaseFixture { }; -TYPED_TEST_CASE(StructGetValueTestTyped, FixedWidthTypes); +TYPED_TEST_SUITE(StructGetValueTestTyped, FixedWidthTypes); TYPED_TEST(StructGetValueTestTyped, mixed_types_valid) { diff --git a/cpp/tests/copying/reverse_tests.cpp b/cpp/tests/copying/reverse_tests.cpp index 3ba928f0249..314b14dbcf5 100644 --- a/cpp/tests/copying/reverse_tests.cpp +++ b/cpp/tests/copying/reverse_tests.cpp @@ -36,7 +36,7 @@ template class ReverseTypedTestFixture : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ReverseTypedTestFixture, cudf::test::AllTypes); +TYPED_TEST_SUITE(ReverseTypedTestFixture, cudf::test::AllTypes); TYPED_TEST(ReverseTypedTestFixture, ReverseTable) { using T = TypeParam; diff --git a/cpp/tests/copying/scatter_list_scalar_tests.cpp b/cpp/tests/copying/scatter_list_scalar_tests.cpp index d60fd82af8c..7d3de9b6c15 100644 --- a/cpp/tests/copying/scatter_list_scalar_tests.cpp +++ b/cpp/tests/copying/scatter_list_scalar_tests.cpp @@ -46,7 +46,7 @@ template class ScatterListOfFixedWidthScalarTest : public ScatterListScalarTests { }; -TYPED_TEST_CASE(ScatterListOfFixedWidthScalarTest, FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(ScatterListOfFixedWidthScalarTest, FixedWidthTypesWithoutFixedPoint); 
// Test grid // Dim1 : {Fixed width, strings, lists, structs} @@ -211,7 +211,7 @@ template class ScatterListOfListScalarTest : public ScatterListScalarTests { }; -TYPED_TEST_CASE(ScatterListOfListScalarTest, FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(ScatterListOfListScalarTest, FixedWidthTypesWithoutFixedPoint); TYPED_TEST(ScatterListOfListScalarTest, Basic) { @@ -308,7 +308,7 @@ class ScatterListOfStructScalarTest : public ScatterListScalarTests { } }; -TYPED_TEST_CASE(ScatterListOfStructScalarTest, FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(ScatterListOfStructScalarTest, FixedWidthTypesWithoutFixedPoint); TYPED_TEST(ScatterListOfStructScalarTest, Basic) { diff --git a/cpp/tests/copying/scatter_list_tests.cpp b/cpp/tests/copying/scatter_list_tests.cpp index 289d1cd6de0..8713553742b 100644 --- a/cpp/tests/copying/scatter_list_tests.cpp +++ b/cpp/tests/copying/scatter_list_tests.cpp @@ -34,7 +34,7 @@ template class TypedScatterListsTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(TypedScatterListsTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(TypedScatterListsTest, cudf::test::FixedWidthTypes); class ScatterListsTest : public cudf::test::BaseFixture { }; diff --git a/cpp/tests/copying/scatter_struct_scalar_tests.cpp b/cpp/tests/copying/scatter_struct_scalar_tests.cpp index 4cfc1029566..44e65110f33 100644 --- a/cpp/tests/copying/scatter_struct_scalar_tests.cpp +++ b/cpp/tests/copying/scatter_struct_scalar_tests.cpp @@ -37,7 +37,7 @@ template struct TypedStructScalarScatterTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(TypedStructScalarScatterTest, FixedWidthTypes); +TYPED_TEST_SUITE(TypedStructScalarScatterTest, FixedWidthTypes); column scatter_single_scalar(scalar const& slr, column_view scatter_map, column_view target) { diff --git a/cpp/tests/copying/scatter_struct_tests.cpp b/cpp/tests/copying/scatter_struct_tests.cpp index 14d3710f0e6..bcd57259b8d 100644 --- a/cpp/tests/copying/scatter_struct_tests.cpp +++ b/cpp/tests/copying/scatter_struct_tests.cpp @@ -44,7 +44,7 @@ using TestTypes = cudf::test::Concat; -TYPED_TEST_CASE(TypedStructScatterTest, TestTypes); +TYPED_TEST_SUITE(TypedStructScatterTest, TestTypes); namespace { auto scatter_structs(std::unique_ptr const& structs_src, diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp index be4a689f213..e4846d4b2c6 100644 --- a/cpp/tests/copying/scatter_tests.cpp +++ b/cpp/tests/copying/scatter_tests.cpp @@ -146,7 +146,7 @@ class ScatterIndexTypeTests : public cudf::test::BaseFixture { }; using IndexTypes = cudf::test::Types; -TYPED_TEST_CASE(ScatterIndexTypeTests, IndexTypes); +TYPED_TEST_SUITE(ScatterIndexTypeTests, IndexTypes); // Throw logic error if check_bounds is set and index is out of bounds TYPED_TEST(ScatterIndexTypeTests, ScatterOutOfBounds) @@ -234,7 +234,7 @@ class ScatterInvalidIndexTypeTests : public cudf::test::BaseFixture { using InvalidIndexTypes = cudf::test::Concat, cudf::test::ChronoTypes, cudf::test::FixedPointTypes>; -TYPED_TEST_CASE(ScatterInvalidIndexTypeTests, InvalidIndexTypes); +TYPED_TEST_SUITE(ScatterInvalidIndexTypeTests, InvalidIndexTypes); // Throw logic error if scatter map column has invalid data type TYPED_TEST(ScatterInvalidIndexTypeTests, ScatterInvalidIndexType) @@ -273,7 +273,7 @@ template class ScatterDataTypeTests : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ScatterDataTypeTests, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(ScatterDataTypeTests, cudf::test::FixedWidthTypes); // Empty scatter map 
returns copy of input TYPED_TEST(ScatterDataTypeTests, EmptyScatterMap) @@ -577,7 +577,7 @@ template class BooleanMaskScatter : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(BooleanMaskScatter, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(BooleanMaskScatter, cudf::test::FixedWidthTypes); TYPED_TEST(BooleanMaskScatter, WithNoNullElementsInTarget) { @@ -744,7 +744,7 @@ struct BooleanMaskScalarScatter : public cudf::test::BaseFixture { } }; -TYPED_TEST_CASE(BooleanMaskScalarScatter, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(BooleanMaskScalarScatter, cudf::test::FixedWidthTypesWithoutFixedPoint); TYPED_TEST(BooleanMaskScalarScatter, WithNoNullElementsInTarget) { @@ -904,7 +904,7 @@ struct FixedPointTestBothReps : public cudf::test::BaseFixture { template using wrapper = cudf::test::fixed_width_column_wrapper; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, FixedPointScatter) { diff --git a/cpp/tests/copying/segmented_gather_list_tests.cpp b/cpp/tests/copying/segmented_gather_list_tests.cpp index 528986e2a8d..e3a003c51d1 100644 --- a/cpp/tests/copying/segmented_gather_list_tests.cpp +++ b/cpp/tests/copying/segmented_gather_list_tests.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -34,9 +35,6 @@ using FixedWidthTypesNotBool = cudf::test::Concat; TYPED_TEST_SUITE(SegmentedGatherTest, FixedWidthTypesNotBool); -class SegmentedGatherTestList : public cudf::test::BaseFixture { -}; - // to disambiguate between {} == 0 and {} == List{0} // Also, see note about compiler issues when declaring nested // empty lists in lists_column_wrapper documentation @@ -44,9 +42,7 @@ template using LCW = cudf::test::lists_column_wrapper; using cudf::lists_column_view; using cudf::lists::detail::segmented_gather; -using cudf::test::iterators::no_nulls; -using cudf::test::iterators::null_at; -using cudf::test::iterators::nulls_at; +using namespace cudf::test::iterators; auto constexpr NULLIFY = cudf::out_of_bounds_policy::NULLIFY; TYPED_TEST(SegmentedGatherTest, Gather) @@ -300,6 +296,25 @@ TYPED_TEST(SegmentedGatherTest, GatherNegatives) } } +TYPED_TEST(SegmentedGatherTest, GatherOnNonCompactedNullLists) +{ + using T = TypeParam; + auto constexpr X = -1; // Signifies null value. + + // List + auto list = LCW{{{1, 2, 3, 4}, {5}, {6, 7}, {8, 9, 0}, {}, {1, 2}, {3, 4, 5}}, no_nulls()}; + auto const input = list.release(); + + // Set non-empty list row at index 5 to null. 
+ cudf::detail::set_null_mask(input->mutable_view().null_mask(), 5, 6, false); + + auto const gather_map = LCW{{-1, 2, 1, -4}, {0}, {-2, 1}, {0, 2, 1}, {}, {0}, {1, 2}}; + auto const expected = + LCW{{{4, 3, 2, 1}, {5}, {6, 7}, {8, 0, 9}, {}, {{X}, all_nulls()}, {4, 5}}, null_at(5)}; + auto const results = segmented_gather(lists_column_view{*input}, lists_column_view{gather_map}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected); +} + TYPED_TEST(SegmentedGatherTest, GatherNestedNulls) { using T = TypeParam; @@ -373,7 +388,8 @@ TYPED_TEST(SegmentedGatherTest, GatherSliced) auto const split_a = cudf::split(a, {3}); { - auto const gather_map = lists_column_view{LCW{{1, 2}, {0, 2}, {0, 1}}}; + auto const list = LCW{{1, 2}, {0, 2}, {0, 1}}; + auto const gather_map = lists_column_view{list}; auto const result = segmented_gather(lists_column_view{split_a[0]}, gather_map); auto const expected = LCW{ {{2, 2}, {3, 3}}, @@ -384,9 +400,9 @@ TYPED_TEST(SegmentedGatherTest, GatherSliced) } { - auto const gather_map = - lists_column_view{LCW{{0, 1}, LCW{}, LCW{}, {0, 1}, LCW{}}}; - auto const result = segmented_gather(lists_column_view{split_a[1]}, gather_map); + auto const list = LCW{{0, 1}, LCW{}, LCW{}, {0, 1}, LCW{}}; + auto const gather_map = lists_column_view{list}; + auto const result = segmented_gather(lists_column_view{split_a[1]}, gather_map); auto const expected = LCW{{{10, 10, 10}, {11, 11}}, LCW{}, LCW{}, {{50, 50, 50, 50}, {6, 13}}, LCW{}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp index ec4efd9313a..210b4b8f90d 100644 --- a/cpp/tests/copying/shift_tests.cpp +++ b/cpp/tests/copying/shift_tests.cpp @@ -63,7 +63,7 @@ template struct ShiftTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ShiftTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(ShiftTest, cudf::test::FixedWidthTypes); TYPED_TEST(ShiftTest, OneColumnEmpty) { diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp index 3b45f96dce1..308cc034c16 100644 --- a/cpp/tests/copying/slice_tests.cpp +++ b/cpp/tests/copying/slice_tests.cpp @@ -36,7 +36,7 @@ template struct SliceTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(SliceTest, cudf::test::NumericTypes); +TYPED_TEST_SUITE(SliceTest, cudf::test::NumericTypes); TYPED_TEST(SliceTest, NumericColumnsWithNulls) { @@ -341,7 +341,7 @@ template struct SliceTableTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(SliceTableTest, cudf::test::NumericTypes); +TYPED_TEST_SUITE(SliceTableTest, cudf::test::NumericTypes); TYPED_TEST(SliceTableTest, NumericColumnsWithNulls) { diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index de7770b4d0a..f7714ce9ac7 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -161,7 +161,7 @@ template struct SplitTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(SplitTest, cudf::test::NumericTypes); +TYPED_TEST_SUITE(SplitTest, cudf::test::NumericTypes); TYPED_TEST(SplitTest, SplitEndLessThanSize) { @@ -550,7 +550,7 @@ void split_empty_output_column_value(SplitFunc Split, CompareFunc Compare) template struct SplitTableTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(SplitTableTest, cudf::test::NumericTypes); +TYPED_TEST_SUITE(SplitTableTest, cudf::test::NumericTypes); TYPED_TEST(SplitTableTest, SplitEndLessThanSize) { @@ -1171,7 +1171,7 @@ struct ContiguousSplitTest : public cudf::test::BaseFixture { using 
FixedWidthTypesWithoutChrono = cudf::test::Concat; -TYPED_TEST_CASE(ContiguousSplitTest, FixedWidthTypesWithoutChrono); +TYPED_TEST_SUITE(ContiguousSplitTest, FixedWidthTypesWithoutChrono); TYPED_TEST(ContiguousSplitTest, LongColumn) { @@ -1387,7 +1387,7 @@ TEST_F(ContiguousSplitStringTableTest, NullStringColumn) template struct ContiguousSplitTableTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ContiguousSplitTableTest, FixedWidthTypesWithoutChrono); +TYPED_TEST_SUITE(ContiguousSplitTableTest, FixedWidthTypesWithoutChrono); TYPED_TEST(ContiguousSplitTableTest, SplitEndLessThanSize) { diff --git a/cpp/tests/copying/utility_tests.cpp b/cpp/tests/copying/utility_tests.cpp index c7bbe4199f0..00a22b90197 100644 --- a/cpp/tests/copying/utility_tests.cpp +++ b/cpp/tests/copying/utility_tests.cpp @@ -31,7 +31,7 @@ struct EmptyLikeTest : public cudf::test::BaseFixture { using numeric_types = cudf::test::NumericTypes; -TYPED_TEST_CASE(EmptyLikeTest, numeric_types); +TYPED_TEST_SUITE(EmptyLikeTest, numeric_types); TYPED_TEST(EmptyLikeTest, ColumnNumericTests) { @@ -76,7 +76,7 @@ template struct EmptyLikeScalarTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(EmptyLikeScalarTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(EmptyLikeScalarTest, cudf::test::FixedWidthTypes); TYPED_TEST(EmptyLikeScalarTest, FixedWidth) { @@ -191,7 +191,7 @@ struct AllocateLikeTest : public cudf::test::BaseFixture { }; ; -TYPED_TEST_CASE(AllocateLikeTest, numeric_types); +TYPED_TEST_SUITE(AllocateLikeTest, numeric_types); TYPED_TEST(AllocateLikeTest, ColumnNumericTestSameSize) { diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 1d3e87279e5..c0d2d1cc447 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -40,7 +40,7 @@ struct NonTimestampTest : public cudf::test::BaseFixture { using NonTimestampTypes = cudf::test::Concat; -TYPED_TEST_CASE(NonTimestampTest, NonTimestampTypes); +TYPED_TEST_SUITE(NonTimestampTest, NonTimestampTypes); TYPED_TEST(NonTimestampTest, TestThrowsOnNonTimestamp) { @@ -152,7 +152,7 @@ struct TypedDatetimeOpsTest : public cudf::test::BaseFixture { cudf::data_type type() { return cudf::data_type{cudf::type_to_id()}; } }; -TYPED_TEST_CASE(TypedDatetimeOpsTest, cudf::test::TimestampTypes); +TYPED_TEST_SUITE(TypedDatetimeOpsTest, cudf::test::TimestampTypes); TYPED_TEST(TypedDatetimeOpsTest, TestEmptyColumns) { @@ -534,7 +534,7 @@ template struct TypedAddMonthsTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(TypedAddMonthsTest, ValidMonthIntegerType); +TYPED_TEST_SUITE(TypedAddMonthsTest, ValidMonthIntegerType); TYPED_TEST(TypedAddMonthsTest, TestAddMonthsWithSeconds) { diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index f8c62a08c20..fddaa9d2050 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -163,7 +163,7 @@ struct AtomicsTest : public cudf::test::BaseFixture { } }; -TYPED_TEST_CASE(AtomicsTest, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(AtomicsTest, cudf::test::FixedWidthTypesWithoutFixedPoint); // tests for atomicAdd/Min/Max TYPED_TEST(AtomicsTest, atomicOps) @@ -319,7 +319,7 @@ struct AtomicsBitwiseOpTest : public cudf::test::BaseFixture { using BitwiseOpTestingTypes = cudf::test::Types; -TYPED_TEST_CASE(AtomicsBitwiseOpTest, BitwiseOpTestingTypes); +TYPED_TEST_SUITE(AtomicsBitwiseOpTest, 
BitwiseOpTestingTypes); TYPED_TEST(AtomicsBitwiseOpTest, atomicBitwiseOps) { diff --git a/cpp/tests/encode/encode_tests.cpp b/cpp/tests/encode/encode_tests.cpp index 73c77a39a97..9cf7d226f9a 100644 --- a/cpp/tests/encode/encode_tests.cpp +++ b/cpp/tests/encode/encode_tests.cpp @@ -28,7 +28,7 @@ class EncodeNumericTests : public cudf::test::BaseFixture { using NumericTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(EncodeNumericTests, NumericTypesNotBool); +TYPED_TEST_SUITE(EncodeNumericTests, NumericTypesNotBool); TYPED_TEST(EncodeNumericTests, SingleNullEncode) { diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp index 75c0cad20e7..6ced6e545d5 100644 --- a/cpp/tests/filling/fill_tests.cpp +++ b/cpp/tests/filling/fill_tests.cpp @@ -95,7 +95,7 @@ class FillTypedTestFixture : public cudf::test::BaseFixture { } }; -TYPED_TEST_CASE(FillTypedTestFixture, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(FillTypedTestFixture, cudf::test::FixedWidthTypesWithoutFixedPoint); TYPED_TEST(FillTypedTestFixture, SetSingle) { diff --git a/cpp/tests/filling/repeat_tests.cpp b/cpp/tests/filling/repeat_tests.cpp index 6c4f0f6b7f0..7d30298b1bd 100644 --- a/cpp/tests/filling/repeat_tests.cpp +++ b/cpp/tests/filling/repeat_tests.cpp @@ -45,7 +45,7 @@ class RepeatTypedTestFixture : public cudf::test::BaseFixture, cudf::size_type repeat_count() { return this->generate(); } }; -TYPED_TEST_CASE(RepeatTypedTestFixture, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(RepeatTypedTestFixture, cudf::test::FixedWidthTypes); TYPED_TEST(RepeatTypedTestFixture, RepeatScalarCount) { diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp index a1d551ca62d..383a69affa1 100644 --- a/cpp/tests/filling/sequence_tests.cpp +++ b/cpp/tests/filling/sequence_tests.cpp @@ -23,6 +23,8 @@ #include #include +#include +#include using namespace cudf; using namespace cudf::test; @@ -36,7 +38,7 @@ class SequenceTestFixture : public cudf::test::BaseFixture { using NumericTypesNoBool = cudf::test::Types; -TYPED_TEST_CASE(SequenceTypedTestFixture, NumericTypesNoBool); +TYPED_TEST_SUITE(SequenceTypedTestFixture, NumericTypesNoBool); TYPED_TEST(SequenceTypedTestFixture, Incrementing) { @@ -134,3 +136,52 @@ TYPED_TEST(SequenceTypedTestFixture, DefaultStep) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected_w); } + +TEST_F(SequenceTestFixture, DateSequenceBasic) +{ + // Timestamp generated using https://www.epochconverter.com/ + timestamp_scalar init(1629852896L, true); // 2021-08-25 00:54:56 GMT + size_type size{5}; + size_type months{1}; + + fixed_width_column_wrapper expected{ + 1629852896L, // 2021-08-25 00:54:56 GMT + 1632531296L, // 2021-09-25 00:54:56 GMT + 1635123296L, // 2021-10-25 00:54:56 GMT + 1637801696L, // 2021-11-25 00:54:56 GMT + 1640393696L, // 2021-12-25 00:54:56 GMT + }; + + auto got = calendrical_month_sequence(size, init, months); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *got); +} + +TEST_F(SequenceTestFixture, DateSequenceLeapYear) +{ + // Timestamp generated using https://www.epochconverter.com/ + timestamp_scalar init(951876379L, true); // 2000-02-29 02:06:19 GMT + size_type size{5}; + size_type months{12}; + + fixed_width_column_wrapper expected{ + 951876379L, // 2000-02-29 02:06:19 GMT Leap Year + 983412379L, // 2001-02-28 02:06:19 GMT + 1014948379L, // 2002-02-28 02:06:19 GMT + 1046484379L, // 2003-02-28 02:06:19 GMT + 1078106779L, // 2004-02-29 02:06:19 GMT Leap Year + }; + + auto got = calendrical_month_sequence(size, init, months); + + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *got); +} + +TEST_F(SequenceTestFixture, DateSequenceBadTypes) +{ + numeric_scalar init(951876379, true); + size_type size = 5; + size_type months = 12; + + EXPECT_THROW(calendrical_month_sequence(size, init, months), cudf::logic_error); +} diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp index ced809c243d..339585756c9 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cpp +++ b/cpp/tests/fixed_point/fixed_point_tests.cpp @@ -40,7 +40,7 @@ struct FixedPointTestBothReps : public cudf::test::BaseFixture { using RepresentationTypes = ::testing::Types; -TYPED_TEST_CASE(FixedPointTestBothReps, RepresentationTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, RepresentationTypes); TYPED_TEST(FixedPointTestBothReps, SimpleDecimalXXConstruction) { diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu index 7244b913a6a..c650a7191be 100644 --- a/cpp/tests/fixed_point/fixed_point_tests.cu +++ b/cpp/tests/fixed_point/fixed_point_tests.cu @@ -45,7 +45,7 @@ struct FixedPointTestBothReps : public cudf::test::BaseFixture { using RepresentationTypes = ::testing::Types; -TYPED_TEST_CASE(FixedPointTestBothReps, RepresentationTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, RepresentationTypes); TYPED_TEST(FixedPointTestBothReps, DecimalXXThrust) { diff --git a/cpp/tests/groupby/argmax_tests.cpp b/cpp/tests/groupby/argmax_tests.cpp index 7cf693f7b08..0b06c184b75 100644 --- a/cpp/tests/groupby/argmax_tests.cpp +++ b/cpp/tests/groupby/argmax_tests.cpp @@ -32,7 +32,7 @@ struct groupby_argmax_test : public cudf::test::BaseFixture { }; using K = int32_t; -TYPED_TEST_CASE(groupby_argmax_test, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(groupby_argmax_test, cudf::test::FixedWidthTypes); TYPED_TEST(groupby_argmax_test, basic) { @@ -182,6 +182,78 @@ TEST_F(groupby_dictionary_argmax_test, basic) force_use_sort_impl::YES); } +struct groupby_argmax_struct_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_argmax_struct_test, basic) +{ + auto const keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = [] { + auto child1 = + strings_column_wrapper{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"}; + auto child2 = fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; + auto const expect_indices = fixed_width_column_wrapper{0, 4, 2}; + + auto agg = cudf::make_argmax_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_indices, std::move(agg)); +} + +TEST_F(groupby_argmax_struct_test, slice_input) +{ + constexpr int32_t dont_care{1}; + auto const keys_original = fixed_width_column_wrapper{ + dont_care, dont_care, 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, dont_care}; + auto const vals_original = [] { + auto child1 = strings_column_wrapper{"dont_care", + "dont_care", + "año", + "bit", + "₹1", + "aaa", + "zit", + "bat", + "aab", + "$1", + "€1", + "wut", + "dont_care"}; + auto child2 = fixed_width_column_wrapper{ + dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const keys = cudf::slice(keys_original, {2, 12})[0]; + auto const vals = cudf::slice(vals_original, {2, 12})[0]; + auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; + auto const expect_indices = fixed_width_column_wrapper{0, 4, 2}; + + auto agg = 
cudf::make_argmax_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_indices, std::move(agg)); +} + +TEST_F(groupby_argmax_struct_test, null_keys_and_values) +{ + constexpr int32_t null{0}; + auto const keys = + fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)}; + auto const vals = [] { + auto child1 = strings_column_wrapper{ + "año", "bit", "₹1", "aaa", "zit", "" /*NULL*/, "" /*NULL*/, "$1", "€1", "wut", "" /*NULL*/}; + auto child2 = fixed_width_column_wrapper{9, 8, 7, 6, 5, null, null, 2, 1, 0, null}; + return structs_column_wrapper{{child1, child2}, nulls_at({5, 6, 10})}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{{1, 2, 3, 4}, no_nulls()}; + auto const expect_indices = fixed_width_column_wrapper{{0, 4, 2, null}, null_at(3)}; + + auto agg = cudf::make_argmax_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_indices, std::move(agg)); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/argmin_tests.cpp b/cpp/tests/groupby/argmin_tests.cpp index 915575546c9..67235a64066 100644 --- a/cpp/tests/groupby/argmin_tests.cpp +++ b/cpp/tests/groupby/argmin_tests.cpp @@ -32,7 +32,7 @@ struct groupby_argmin_test : public cudf::test::BaseFixture { }; using K = int32_t; -TYPED_TEST_CASE(groupby_argmin_test, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(groupby_argmin_test, cudf::test::FixedWidthTypes); TYPED_TEST(groupby_argmin_test, basic) { @@ -183,5 +183,77 @@ TEST_F(groupby_dictionary_argmin_test, basic) force_use_sort_impl::YES); } +struct groupby_argmin_struct_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_argmin_struct_test, basic) +{ + auto const keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = [] { + auto child1 = + strings_column_wrapper{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"}; + auto child2 = fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; + auto const expect_indices = fixed_width_column_wrapper{3, 5, 7}; + + auto agg = cudf::make_argmin_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_indices, std::move(agg)); +} + +TEST_F(groupby_argmin_struct_test, slice_input) +{ + constexpr int32_t dont_care{1}; + auto const keys_original = fixed_width_column_wrapper{ + dont_care, dont_care, 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, dont_care}; + auto const vals_original = [] { + auto child1 = strings_column_wrapper{"dont_care", + "dont_care", + "año", + "bit", + "₹1", + "aaa", + "zit", + "bat", + "aab", + "$1", + "€1", + "wut", + "dont_care"}; + auto child2 = fixed_width_column_wrapper{ + dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const keys = cudf::slice(keys_original, {2, 12})[0]; + auto const vals = cudf::slice(vals_original, {2, 12})[0]; + auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; + auto const expect_indices = fixed_width_column_wrapper{3, 5, 7}; + + auto agg = cudf::make_argmin_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_indices, std::move(agg)); +} + +TEST_F(groupby_argmin_struct_test, null_keys_and_values) +{ + constexpr int32_t null{0}; + auto const keys = + fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)}; + auto const vals = [] { + auto child1 = strings_column_wrapper{ + "año", "bit", "₹1", "aaa", "zit", "" /*NULL*/, "" 
/*NULL*/, "$1", "€1", "wut", "" /*NULL*/}; + auto child2 = fixed_width_column_wrapper{9, 8, 7, 6, 5, null, null, 2, 1, 0, null}; + return structs_column_wrapper{{child1, child2}, nulls_at({5, 6, 10})}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{{1, 2, 3, 4}, no_nulls()}; + auto const expect_indices = fixed_width_column_wrapper{{3, 1, 8, null}, null_at(3)}; + + auto agg = cudf::make_argmin_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_indices, std::move(agg)); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/collect_list_tests.cpp b/cpp/tests/groupby/collect_list_tests.cpp index 009917dabae..8a724526dbf 100644 --- a/cpp/tests/groupby/collect_list_tests.cpp +++ b/cpp/tests/groupby/collect_list_tests.cpp @@ -32,7 +32,7 @@ struct groupby_collect_list_test : public cudf::test::BaseFixture { using FixedWidthTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(groupby_collect_list_test, FixedWidthTypesNotBool); +TYPED_TEST_SUITE(groupby_collect_list_test, FixedWidthTypesNotBool); TYPED_TEST(groupby_collect_list_test, CollectWithoutNulls) { @@ -174,21 +174,17 @@ TYPED_TEST(groupby_collect_list_test, CollectOnEmptyInputListsOfStructs) auto struct_column = structs_column_wrapper{{struct_child}}; auto values = cudf::make_lists_column( - 0, make_empty_column(data_type{type_to_id()}), struct_column.release(), 0, {}); + 0, make_empty_column(type_to_id()), struct_column.release(), 0, {}); fixed_width_column_wrapper expect_keys{}; auto expect_struct_child = LCW{}; auto expect_struct_column = structs_column_wrapper{{expect_struct_child}}; - auto expect_child = - cudf::make_lists_column(0, - make_empty_column(data_type{type_to_id()}), - expect_struct_column.release(), - 0, - {}); + auto expect_child = cudf::make_lists_column( + 0, make_empty_column(type_to_id()), expect_struct_column.release(), 0, {}); auto expect_values = cudf::make_lists_column( - 0, make_empty_column(data_type{type_to_id()}), std::move(expect_child), 0, {}); + 0, make_empty_column(type_to_id()), std::move(expect_child), 0, {}); auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values->view(), expect_keys, expect_values->view(), std::move(agg)); diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp index 198caabfca9..c429dc72259 100644 --- a/cpp/tests/groupby/collect_set_tests.cpp +++ b/cpp/tests/groupby/collect_set_tests.cpp @@ -57,7 +57,7 @@ struct CollectSetTypedTest : public cudf::test::BaseFixture { using FixedWidthTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(CollectSetTypedTest, FixedWidthTypesNotBool); +TYPED_TEST_SUITE(CollectSetTypedTest, FixedWidthTypesNotBool); TYPED_TEST(CollectSetTypedTest, TrivialInput) { diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp new file mode 100644 index 00000000000..44db3eb859f --- /dev/null +++ b/cpp/tests/groupby/correlation_tests.cpp @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +using namespace cudf::test::iterators; +namespace cudf { +namespace test { + +constexpr auto nan = std::numeric_limits::quiet_NaN(); +using structs = structs_column_wrapper; + +template +struct groupby_correlation_test : public cudf::test::BaseFixture { +}; + +using supported_types = RemoveIf>, cudf::test::NumericTypes>; + +TYPED_TEST_SUITE(groupby_correlation_test, supported_types); +using K = int32_t; + +TYPED_TEST(groupby_correlation_test, basic) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + fixed_width_column_wrapper expect_vals{{1.0, 0.6, nan}}; + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_correlation_test, empty_cols) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys{}; + fixed_width_column_wrapper member_0{}, member_1{}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{}; + fixed_width_column_wrapper expect_vals{}; + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_correlation_test, zero_valid_keys) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys({1, 2, 3}, all_nulls()); + fixed_width_column_wrapper member_0{3, 4, 5}, member_1{6, 7, 8}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{}; + fixed_width_column_wrapper expect_vals{}; + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_correlation_test, zero_valid_values) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys{1, 1, 1}; + fixed_width_column_wrapper member_0({3, 4, 5}, all_nulls()); + fixed_width_column_wrapper member_1({3, 4, 5}, all_nulls()); + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1}; + fixed_width_column_wrapper expect_vals({0}, all_nulls()); + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_correlation_test, null_keys_and_values) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + fixed_width_column_wrapper val1({1, 1, 1, 2, 0, 3, 3,-1, 0, 2, 2}); + // clang-format on + auto vals = structs{{val0, 
val1}}; + + fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); + fixed_width_column_wrapper expect_vals({1.0, 0.6, nan, 0.}, {1, 1, 1, 0}); + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_correlation_test, null_values_same) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + fixed_width_column_wrapper val1({1, 1, 1, 2, 0, 3, 3,-1, 0, 2, 2}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + // clang-format on + auto vals = structs{{val0, val1}}; + + fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); + fixed_width_column_wrapper expect_vals({1.0, 0.6, nan, 0.}, {1, 1, 1, 0}); + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +// keys=[1, 1, 1, 2, 2, 2, 2, 3, N, 3, 4] +// val0=[N, 2, 3, 1, N, 3, 4, 1,-1, 1, 4] +// val1=[N, 2, 3, 2,-1, 6,-6/1, 1,-1, 0, N] +// corr=[ 1.0, -0.5/0, NAN, NAN] +TYPED_TEST(groupby_correlation_test, null_values_different) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4}, + {0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); + fixed_width_column_wrapper val1({1, 2, 1, 2,-1, 6, 3,-1, 0, 1, 2}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + // clang-format on + auto vals = structs{{val0, val1}}; + + fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); + fixed_width_column_wrapper expect_vals({1.0, 0., nan, 0.}, {1, 1, 1, 0}); + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_correlation_test, min_periods) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + + fixed_width_column_wrapper expect_vals1{{1.0, 0.6, nan}}; + auto agg1 = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON, 3); + test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1), force_use_sort_impl::YES); + + fixed_width_column_wrapper expect_vals2{{1.0, 0.6, nan}, {0, 1, 0}}; + auto agg2 = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON, 4); + test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2), force_use_sort_impl::YES); + + fixed_width_column_wrapper expect_vals3{{1.0, 0.6, nan}, {0, 0, 0}}; + auto agg3 = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON, 5); + test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg3), force_use_sort_impl::YES); +} + +struct groupby_dictionary_correlation_test : public cudf::test::BaseFixture { +}; + 
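// The struct-of-two-members pattern used throughout these correlation tests is also how the
// aggregation is requested outside the test harness: pack the two numeric columns into a
// struct column and ask groupby for a Pearson correlation on it. A minimal sketch follows,
// assuming only the public cudf groupby API exercised above; the template arguments, headers,
// and data values are illustrative, not taken from this patch.
#include <cudf/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <memory>
#include <vector>

std::unique_ptr<cudf::column> pearson_by_key()
{
  using namespace cudf::test;
  fixed_width_column_wrapper<int32_t> keys{1, 2, 1, 2};
  fixed_width_column_wrapper<double> x{1.0, 2.0, 3.0, 4.0};
  fixed_width_column_wrapper<double> y{2.0, 4.0, 6.0, 8.0};
  structs_column_wrapper vals{{x, y}};  // member_0 and member_1, as in the tests above

  cudf::groupby::groupby gb(cudf::table_view({keys}));
  std::vector<cudf::groupby::aggregation_request> requests(1);
  requests[0].values = vals;
  requests[0].aggregations.push_back(cudf::make_correlation_aggregation<cudf::groupby_aggregation>(
    cudf::correlation_type::PEARSON));

  auto result = gb.aggregate(requests);           // pair of (unique-keys table, per-request results)
  return std::move(result.second[0].results[0]);  // one FLOAT64 correlation value per key group
}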
+TEST_F(groupby_dictionary_correlation_test, basic) +{ + using V = int16_t; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = dictionary_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = dictionary_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + fixed_width_column_wrapper expect_vals{{1.0, 0.6, nan}}; + + auto agg = + cudf::make_correlation_aggregation(cudf::correlation_type::PEARSON); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/count_scan_tests.cpp b/cpp/tests/groupby/count_scan_tests.cpp index 62e8b11241d..062efe6094e 100644 --- a/cpp/tests/groupby/count_scan_tests.cpp +++ b/cpp/tests/groupby/count_scan_tests.cpp @@ -38,7 +38,7 @@ struct groupby_count_scan_test : public cudf::test::BaseFixture { using result_wrapper = fixed_width_column_wrapper; }; -TYPED_TEST_CASE(groupby_count_scan_test, cudf::test::AllTypes); +TYPED_TEST_SUITE(groupby_count_scan_test, cudf::test::AllTypes); TYPED_TEST(groupby_count_scan_test, basic) { @@ -159,7 +159,7 @@ template struct FixedPointTestBothReps : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, GroupByCountScan) { diff --git a/cpp/tests/groupby/count_tests.cpp b/cpp/tests/groupby/count_tests.cpp index cbb821767c9..c70f3fd942a 100644 --- a/cpp/tests/groupby/count_tests.cpp +++ b/cpp/tests/groupby/count_tests.cpp @@ -32,7 +32,7 @@ struct groupby_count_test : public cudf::test::BaseFixture { }; using K = int32_t; -TYPED_TEST_CASE(groupby_count_test, cudf::test::AllTypes); +TYPED_TEST_SUITE(groupby_count_test, cudf::test::AllTypes); TYPED_TEST(groupby_count_test, basic) { @@ -172,7 +172,7 @@ template struct FixedPointTestBothReps : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, GroupByCount) { diff --git a/cpp/tests/groupby/covariance_tests.cpp b/cpp/tests/groupby/covariance_tests.cpp new file mode 100644 index 00000000000..a6b0545ccd5 --- /dev/null +++ b/cpp/tests/groupby/covariance_tests.cpp @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +using namespace cudf::test::iterators; +namespace cudf { +namespace test { + +using structs = structs_column_wrapper; + +template +struct groupby_covariance_test : public cudf::test::BaseFixture { +}; + +using supported_types = RemoveIf>, cudf::test::NumericTypes>; + +TYPED_TEST_SUITE(groupby_covariance_test, supported_types); +using K = int32_t; + +TYPED_TEST(groupby_covariance_test, basic) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + fixed_width_column_wrapper expect_vals{{1.0, 1.0, 0.0}}; + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, empty_cols) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys{}; + fixed_width_column_wrapper member_0{}, member_1{}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{}; + fixed_width_column_wrapper expect_vals{}; + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, zero_valid_keys) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys({1, 2, 3}, all_nulls()); + fixed_width_column_wrapper member_0{3, 4, 5}, member_1{6, 7, 8}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{}; + fixed_width_column_wrapper expect_vals{}; + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, zero_valid_values) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + fixed_width_column_wrapper keys{1, 1, 1}; + fixed_width_column_wrapper member_0({3, 4, 5}, all_nulls()); + fixed_width_column_wrapper member_1({3, 4, 5}, all_nulls()); + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1}; + fixed_width_column_wrapper expect_vals({0}, all_nulls()); + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, null_keys_and_values) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + fixed_width_column_wrapper val1({1, 1, 1, 2, 0, 3, 3,-1, 0, 2, 2}); + // clang-format on + auto vals = structs{{val0, val1}}; + + fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); + fixed_width_column_wrapper expect_vals({0.5, 1.0, 0.0, -0.}, {1, 1, 1, 0}); + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), 
force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, null_values_same) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + fixed_width_column_wrapper val1({1, 1, 1, 2, 0, 3, 3,-1, 0, 2, 2}, + {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}); + // clang-format on + auto vals = structs{{val0, val1}}; + + fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); + fixed_width_column_wrapper expect_vals({0.5, 1.0, 0.0, -0.}, {1, 1, 1, 0}); + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, null_values_different) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + // clang-format off + fixed_width_column_wrapper keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, + {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}); + fixed_width_column_wrapper val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4}, + {0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); + fixed_width_column_wrapper val1({1, 2, 1, 2,-1, 3, 3,-1, 0, 4, 2}, + {0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0}); + // clang-format on + auto vals = structs{{val0, val1}}; + + fixed_width_column_wrapper expect_keys({1, 2, 3, 4}, no_nulls()); + fixed_width_column_wrapper expect_vals( + {std::numeric_limits::quiet_NaN(), 1.5, 0.0, -0.}, {0, 1, 1, 0}); + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, min_periods) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + + fixed_width_column_wrapper expect_vals1{{1.0, 1.0, 0.0}}; + auto agg1 = cudf::make_covariance_aggregation(3); + test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1), force_use_sort_impl::YES); + + fixed_width_column_wrapper expect_vals2{{1.0, 1.0, 0.0}, {0, 1, 0}}; + auto agg2 = cudf::make_covariance_aggregation(4); + test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2), force_use_sort_impl::YES); + + fixed_width_column_wrapper expect_vals3{{1.0, 1.0, 0.0}, {0, 0, 0}}; + auto agg3 = cudf::make_covariance_aggregation(5); + test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg3), force_use_sort_impl::YES); +} + +TYPED_TEST(groupby_covariance_test, ddof) +{ + using V = TypeParam; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = fixed_width_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = fixed_width_column_wrapper{{1, 1, 1, 2, 0, 3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + + fixed_width_column_wrapper expect_vals1{{2.0, 1.5, 0.0}}; + auto agg1 = cudf::make_covariance_aggregation(1, 2); + test_single_agg(keys, vals, expect_keys, expect_vals1, std::move(agg1), force_use_sort_impl::YES); + + auto const inf = 
std::numeric_limits::infinity(); + fixed_width_column_wrapper expect_vals2{{inf, 3.0, 0.0}, {0, 1, 0}}; + auto agg2 = cudf::make_covariance_aggregation(1, 3); + test_single_agg(keys, vals, expect_keys, expect_vals2, std::move(agg2), force_use_sort_impl::YES); +} + +struct groupby_dictionary_covariance_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_dictionary_covariance_test, basic) +{ + using V = int16_t; + using R = cudf::detail::target_type_t; + + auto keys = fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}}; + auto member_0 = dictionary_column_wrapper{{1, 1, 1, 2, 2, 3, 3, 1, 1, 4}}; + auto member_1 = dictionary_column_wrapper{{1, 1, 1, 2, 3, -3, 3, 1, 1, 2}}; + auto vals = structs{{member_0, member_1}}; + + fixed_width_column_wrapper expect_keys{1, 2, 3}; + fixed_width_column_wrapper expect_vals{{1.0, -0.5, 0.0}}; + + auto agg = cudf::make_covariance_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg), force_use_sort_impl::YES); +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/groupby/groups_tests.cpp b/cpp/tests/groupby/groups_tests.cpp index 7ab771c3c36..2ca359e0838 100644 --- a/cpp/tests/groupby/groups_tests.cpp +++ b/cpp/tests/groupby/groups_tests.cpp @@ -34,7 +34,7 @@ template struct groupby_group_keys_and_values_test : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(groupby_group_keys_and_values_test, NumericTypes); +TYPED_TEST_SUITE(groupby_group_keys_and_values_test, NumericTypes); TEST_F(groupby_group_keys_test, basic) { diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp index 683eeb7eb01..94c26f3fe8f 100644 --- a/cpp/tests/groupby/keys_tests.cpp +++ b/cpp/tests/groupby/keys_tests.cpp @@ -34,7 +34,7 @@ struct groupby_keys_test : public cudf::test::BaseFixture { using supported_types = cudf::test:: Types; -TYPED_TEST_CASE(groupby_keys_test, supported_types); +TYPED_TEST_SUITE(groupby_keys_test, supported_types); TYPED_TEST(groupby_keys_test, basic) { @@ -289,5 +289,66 @@ TEST_F(groupby_dictionary_keys_test, basic) force_use_sort_impl::YES); } +struct groupby_cache_test : public cudf::test::BaseFixture { +}; + +// To check if the cache doesn't insert multiple times to cache for same aggregation on a column in +// same request. +// If this test fails, then insert happened and key stored in cache map becomes dangling reference. +// Any comparison with same aggregation as key will fail. +TEST_F(groupby_cache_test, duplicate_agggregations) +{ + using K = int32_t; + using V = int32_t; + + fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + groupby::groupby gb_obj(table_view({keys})); + + std::vector requests; + requests.emplace_back(groupby::aggregation_request()); + requests[0].values = vals; + requests[0].aggregations.push_back(cudf::make_sum_aggregation()); + requests[0].aggregations.push_back(cudf::make_sum_aggregation()); + + // hash groupby + EXPECT_NO_THROW(gb_obj.aggregate(requests)); + + // sort groupby + // WAR to force groupby to use sort implementation + requests[0].aggregations.push_back(make_nth_element_aggregation(0)); + EXPECT_NO_THROW(gb_obj.aggregate(requests)); +} + +// To check if the cache doesn't insert multiple times to cache for same aggregation on same column +// but in different requests. +// If this test fails, then insert happened and key stored in cache map becomes dangling reference. +// Any comparison with same aggregation as key will fail. 
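// The cache discussed in the comments above is keyed by the value column and the aggregation,
// so a repeated aggregation in a request must be served from the cache rather than inserted
// again. The sketch below mirrors that pattern under the same public API: it issues SUM twice,
// then adds NTH_ELEMENT, which is only available on the sort-based code path and is therefore
// the usual workaround for forcing sort-based groupby instead of hash-based. Names and values
// here are illustrative.
#include <cudf/aggregation.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <vector>

void exercise_result_cache()
{
  using namespace cudf::test;
  fixed_width_column_wrapper<int32_t> keys{1, 2, 1, 2};
  fixed_width_column_wrapper<int32_t> vals{10, 20, 30, 40};

  cudf::groupby::groupby gb(cudf::table_view({keys}));
  std::vector<cudf::groupby::aggregation_request> requests(1);
  requests[0].values = vals;
  // Two identical aggregations: the second should reuse the cached result, not insert again.
  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
  requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
  gb.aggregate(requests);  // hash-based groupby

  // Adding NTH_ELEMENT forces the sort-based implementation on the next call.
  requests[0].aggregations.push_back(
    cudf::make_nth_element_aggregation<cudf::groupby_aggregation>(0));
  gb.aggregate(requests);  // sort-based groupby
}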
+TEST_F(groupby_cache_test, duplicate_columns) +{ + using K = int32_t; + using V = int32_t; + + fixed_width_column_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + fixed_width_column_wrapper vals{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + groupby::groupby gb_obj(table_view({keys})); + + std::vector requests; + requests.emplace_back(groupby::aggregation_request()); + requests[0].values = vals; + requests[0].aggregations.push_back(cudf::make_sum_aggregation()); + requests.emplace_back(groupby::aggregation_request()); + requests[1].values = vals; + requests[1].aggregations.push_back(cudf::make_sum_aggregation()); + + // hash groupby + EXPECT_NO_THROW(gb_obj.aggregate(requests)); + + // sort groupby + // WAR to force groupby to use sort implementation + requests[0].aggregations.push_back(make_nth_element_aggregation(0)); + EXPECT_NO_THROW(gb_obj.aggregate(requests)); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp index 4d83dc9f7ba..bb2f87fd424 100644 --- a/cpp/tests/groupby/max_scan_tests.cpp +++ b/cpp/tests/groupby/max_scan_tests.cpp @@ -39,7 +39,7 @@ struct groupby_max_scan_test : public cudf::test::BaseFixture { using result_wrapper = fixed_width_column_wrapper; }; -TYPED_TEST_CASE(groupby_max_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(groupby_max_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint); TYPED_TEST(groupby_max_scan_test, basic) { @@ -128,11 +128,27 @@ TYPED_TEST(groupby_max_scan_test, null_keys_and_values) test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } +struct groupby_max_scan_string_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_max_scan_string_test, basic) +{ + key_wrapper keys{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + strings_column_wrapper vals{"año", "bit", "₹1", "aaa", "zit", "bat", "aaa", "$1", "₹1", "wut"}; + + key_wrapper expect_keys{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + strings_column_wrapper expect_vals( + {"año", "año", "año", "bit", "zit", "zit", "zit", "₹1", "₹1", "₹1"}); + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + template struct FixedPointTestBothReps : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxScanDecimalAsValue) { @@ -157,5 +173,90 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxScanDecimalAsValue) } } +struct groupby_max_scan_struct_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_max_scan_struct_test, basic) +{ + auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = [] { + auto child1 = + strings_column_wrapper{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"}; + auto child2 = fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals = [] { + auto child1 = + strings_column_wrapper{"año", "año", "año", "bit", "zit", "zit", "zit", "₹1", "₹1", "₹1"}; + auto child2 = fixed_width_column_wrapper{1, 1, 1, 2, 5, 5, 5, 3, 3, 3}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TEST_F(groupby_max_scan_struct_test, slice_input) 
+{ + constexpr int32_t dont_care{1}; + auto const keys_original = + key_wrapper{dont_care, dont_care, 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, dont_care}; + auto const vals_original = [] { + auto child1 = strings_column_wrapper{"dont_care", + "dont_care", + "año", + "bit", + "₹1", + "aaa", + "zit", + "bat", + "aab", + "$1", + "€1", + "wut", + "dont_care"}; + auto child2 = key_wrapper{dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const keys = cudf::slice(keys_original, {2, 12})[0]; + auto const vals = cudf::slice(vals_original, {2, 12})[0]; + auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals = [] { + auto child1 = + strings_column_wrapper{"año", "año", "año", "bit", "zit", "zit", "zit", "₹1", "₹1", "₹1"}; + auto child2 = fixed_width_column_wrapper{1, 1, 1, 2, 5, 5, 5, 3, 3, 3}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TEST_F(groupby_max_scan_struct_test, null_keys_and_values) +{ + constexpr int32_t null{0}; + auto const keys = key_wrapper{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)}; + auto const vals = [] { + auto child1 = strings_column_wrapper{ + "año", "bit", "₹1", "aaa", "zit", "" /*NULL*/, "" /*NULL*/, "$1", "€1", "wut", "" /*NULL*/}; + auto child2 = fixed_width_column_wrapper{9, 8, 7, 6, 5, null, null, 2, 1, 0, null}; + return structs_column_wrapper{{child1, child2}, nulls_at({5, 6, 10})}; + }(); + + auto const expect_keys = key_wrapper{{1, 1, 1, 2, 2, 2, 2, 3, 3, 4}, no_nulls()}; + auto const expect_vals = [] { + auto child1 = strings_column_wrapper{ + "año", "año", "" /*NULL*/, "bit", "zit", "" /*NULL*/, "zit", "₹1", "₹1", "" /*NULL*/}; + auto child2 = fixed_width_column_wrapper{9, 9, null, 8, 5, null, 5, 7, 7, null}; + return structs_column_wrapper{{child1, child2}, nulls_at({2, 5, 9})}; + }(); + + auto agg = cudf::make_max_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp index 491e6927304..8d15401aa09 100644 --- a/cpp/tests/groupby/max_tests.cpp +++ b/cpp/tests/groupby/max_tests.cpp @@ -33,7 +33,7 @@ struct groupby_max_test : public cudf::test::BaseFixture { }; using K = int32_t; -TYPED_TEST_CASE(groupby_max_test, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(groupby_max_test, cudf::test::FixedWidthTypesWithoutFixedPoint); TYPED_TEST(groupby_max_test, basic) { @@ -255,7 +255,7 @@ template struct FixedPointTestBothReps : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, GroupBySortMaxDecimalAsValue) { @@ -304,5 +304,89 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashMaxDecimalAsValue) } } +struct groupby_max_struct_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_max_struct_test, basic) +{ + auto const keys = fixed_width_column_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = [] { + auto child1 = + strings_column_wrapper{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"}; + auto child2 = fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const expect_keys = 
fixed_width_column_wrapper{1, 2, 3}; + auto const expect_vals = [] { + auto child1 = strings_column_wrapper{"año", "zit", "₹1"}; + auto child2 = fixed_width_column_wrapper{1, 5, 3}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto agg = cudf::make_max_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TEST_F(groupby_max_struct_test, slice_input) +{ + constexpr int32_t dont_care{1}; + auto const keys_original = fixed_width_column_wrapper{ + dont_care, dont_care, 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, dont_care}; + auto const vals_original = [] { + auto child1 = strings_column_wrapper{"dont_care", + "dont_care", + "año", + "bit", + "₹1", + "aaa", + "zit", + "bat", + "aab", + "$1", + "€1", + "wut", + "dont_care"}; + auto child2 = fixed_width_column_wrapper{ + dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const keys = cudf::slice(keys_original, {2, 12})[0]; + auto const vals = cudf::slice(vals_original, {2, 12})[0]; + auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; + auto const expect_vals = [] { + auto child1 = strings_column_wrapper{"año", "zit", "₹1"}; + auto child2 = fixed_width_column_wrapper{1, 5, 3}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto agg = cudf::make_max_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TEST_F(groupby_max_struct_test, null_keys_and_values) +{ + constexpr int32_t null{0}; + auto const keys = + fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)}; + auto const vals = [] { + auto child1 = strings_column_wrapper{ + "año", "bit", "₹1", "aaa", "zit", "" /*NULL*/, "" /*NULL*/, "$1", "€1", "wut", "" /*NULL*/}; + auto child2 = fixed_width_column_wrapper{9, 8, 7, 6, 5, null, null, 2, 1, 0, null}; + return structs_column_wrapper{{child1, child2}, nulls_at({5, 6, 10})}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{{1, 2, 3, 4}, no_nulls()}; + auto const expect_vals = [] { + auto child1 = strings_column_wrapper{"año", "zit", "₹1", "" /*NULL*/}; + auto child2 = fixed_width_column_wrapper{9, 5, 7, null}; + return structs_column_wrapper{{child1, child2}, null_at(3)}; + }(); + + auto agg = cudf::make_max_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp index d390c8a1880..ab794bf22df 100644 --- a/cpp/tests/groupby/mean_tests.cpp +++ b/cpp/tests/groupby/mean_tests.cpp @@ -48,7 +48,7 @@ std::vector convert(std::initializer_list in) using supported_types = cudf::test::Concat, cudf::test::DurationTypes>; -TYPED_TEST_CASE(groupby_mean_test, supported_types); +TYPED_TEST_SUITE(groupby_mean_test, supported_types); using K = int32_t; TYPED_TEST(groupby_mean_test, basic) @@ -164,7 +164,7 @@ template struct FixedPointTestBothReps : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, GroupBySortMeanDecimalAsValue) { diff --git a/cpp/tests/groupby/median_tests.cpp b/cpp/tests/groupby/median_tests.cpp index 86d89325401..087b104539e 100644 --- a/cpp/tests/groupby/median_tests.cpp +++ b/cpp/tests/groupby/median_tests.cpp @@ -34,7 +34,7 @@ struct groupby_median_test : public cudf::test::BaseFixture { using K = 
int32_t; using supported_types = cudf::test::Types; -TYPED_TEST_CASE(groupby_median_test, supported_types); +TYPED_TEST_SUITE(groupby_median_test, supported_types); TYPED_TEST(groupby_median_test, basic) { diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp index b6b1d1a1720..7c24c6267ca 100644 --- a/cpp/tests/groupby/merge_lists_tests.cpp +++ b/cpp/tests/groupby/merge_lists_tests.cpp @@ -60,7 +60,7 @@ struct GroupbyMergeListsTypedTest : public cudf::test::BaseFixture { using FixedWidthTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(GroupbyMergeListsTypedTest, FixedWidthTypesNotBool); +TYPED_TEST_SUITE(GroupbyMergeListsTypedTest, FixedWidthTypesNotBool); TYPED_TEST(GroupbyMergeListsTypedTest, InvalidInput) { diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp index 5a65774b430..1e2f0c9fa9e 100644 --- a/cpp/tests/groupby/merge_sets_tests.cpp +++ b/cpp/tests/groupby/merge_sets_tests.cpp @@ -60,7 +60,7 @@ struct GroupbyMergeSetsTypedTest : public cudf::test::BaseFixture { using FixedWidthTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(GroupbyMergeSetsTypedTest, FixedWidthTypesNotBool); +TYPED_TEST_SUITE(GroupbyMergeSetsTypedTest, FixedWidthTypesNotBool); TYPED_TEST(GroupbyMergeSetsTypedTest, InvalidInput) { diff --git a/cpp/tests/groupby/min_scan_tests.cpp b/cpp/tests/groupby/min_scan_tests.cpp index 452f70eaf16..06c0f5ceb3b 100644 --- a/cpp/tests/groupby/min_scan_tests.cpp +++ b/cpp/tests/groupby/min_scan_tests.cpp @@ -38,7 +38,7 @@ struct groupby_min_scan_test : public cudf::test::BaseFixture { using result_wrapper = fixed_width_column_wrapper; }; -TYPED_TEST_CASE(groupby_min_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(groupby_min_scan_test, cudf::test::FixedWidthTypesWithoutFixedPoint); TYPED_TEST(groupby_min_scan_test, basic) { @@ -135,18 +135,18 @@ TEST_F(groupby_min_scan_string_test, basic) strings_column_wrapper vals{"año", "bit", "₹1", "aaa", "zit", "bat", "aaa", "$1", "₹1", "wut"}; key_wrapper expect_keys{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; - strings_column_wrapper expect_vals; + strings_column_wrapper expect_vals( + {"año", "aaa", "aaa", "bit", "bit", "bat", "bat", "₹1", "$1", "$1"}); auto agg = cudf::make_min_aggregation(); - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)), - "Unsupported groupby scan type-agg combination"); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); } template struct FixedPointTestBothReps : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, GroupBySortMinScanDecimalAsValue) { @@ -172,5 +172,90 @@ TYPED_TEST(FixedPointTestBothReps, GroupBySortMinScanDecimalAsValue) } } +struct groupby_min_scan_struct_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_min_scan_struct_test, basic) +{ + auto const keys = key_wrapper{1, 2, 3, 1, 2, 2, 1, 3, 3, 2}; + auto const vals = [] { + auto child1 = + strings_column_wrapper{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"}; + auto child2 = fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals = [] { + auto child1 = + strings_column_wrapper{"año", "aaa", "aaa", "bit", "bit", "bat", "bat", 
"₹1", "$1", "$1"}; + auto child2 = fixed_width_column_wrapper{1, 4, 4, 2, 2, 6, 6, 3, 8, 8}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TEST_F(groupby_min_scan_struct_test, slice_input) +{ + constexpr int32_t dont_care{1}; + auto const keys_original = + key_wrapper{dont_care, dont_care, 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, dont_care}; + auto const vals_original = [] { + auto child1 = strings_column_wrapper{"dont_care", + "dont_care", + "año", + "bit", + "₹1", + "aaa", + "zit", + "bat", + "aab", + "$1", + "€1", + "wut", + "dont_care"}; + auto child2 = key_wrapper{dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const keys = cudf::slice(keys_original, {2, 12})[0]; + auto const vals = cudf::slice(vals_original, {2, 12})[0]; + auto const expect_keys = key_wrapper{1, 1, 1, 2, 2, 2, 2, 3, 3, 3}; + auto const expect_vals = [] { + auto child1 = + strings_column_wrapper{"año", "aaa", "aaa", "bit", "bit", "bat", "bat", "₹1", "$1", "$1"}; + auto child2 = fixed_width_column_wrapper{1, 4, 4, 2, 2, 6, 6, 3, 8, 8}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TEST_F(groupby_min_scan_struct_test, null_keys_and_values) +{ + constexpr int32_t null{0}; + auto const keys = key_wrapper{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)}; + auto const vals = [] { + auto child1 = strings_column_wrapper{ + "año", "bit", "₹1", "aaa", "zit", "" /*NULL*/, "" /*NULL*/, "$1", "€1", "wut", "" /*NULL*/}; + auto child2 = fixed_width_column_wrapper{9, 8, 7, 6, 5, null, null, 2, 1, 0, null}; + return structs_column_wrapper{{child1, child2}, nulls_at({5, 6, 10})}; + }(); + + auto const expect_keys = key_wrapper{{1, 1, 1, 2, 2, 2, 2, 3, 3, 4}, no_nulls()}; + auto const expect_vals = [] { + auto child1 = strings_column_wrapper{ + "año", "aaa", "" /*NULL*/, "bit", "bit", "" /*NULL*/, "bit", "₹1", "€1", "" /*NULL*/}; + auto child2 = fixed_width_column_wrapper{9, 6, null, 8, 8, null, 8, 7, 1, null}; + return structs_column_wrapper{{child1, child2}, nulls_at({2, 5, 9})}; + }(); + + auto agg = cudf::make_min_aggregation(); + test_single_scan(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp index 4f8db1d750c..c2cfca83b29 100644 --- a/cpp/tests/groupby/min_tests.cpp +++ b/cpp/tests/groupby/min_tests.cpp @@ -33,7 +33,7 @@ struct groupby_min_test : public cudf::test::BaseFixture { }; using K = int32_t; -TYPED_TEST_CASE(groupby_min_test, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(groupby_min_test, cudf::test::FixedWidthTypesWithoutFixedPoint); TYPED_TEST(groupby_min_test, basic) { @@ -255,7 +255,7 @@ template struct FixedPointTestBothReps : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, GroupBySortMinDecimalAsValue) { @@ -303,5 +303,89 @@ TYPED_TEST(FixedPointTestBothReps, GroupByHashMinDecimalAsValue) } } +struct groupby_min_struct_test : public cudf::test::BaseFixture { +}; + +TEST_F(groupby_min_struct_test, basic) +{ + auto const keys = fixed_width_column_wrapper{1, 2, 3, 1, 
2, 2, 1, 3, 3, 2}; + auto const vals = [] { + auto child1 = + strings_column_wrapper{"año", "bit", "₹1", "aaa", "zit", "bat", "aab", "$1", "€1", "wut"}; + auto child2 = fixed_width_column_wrapper{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; + auto const expect_vals = [] { + auto child1 = strings_column_wrapper{"aaa", "bat", "$1"}; + auto child2 = fixed_width_column_wrapper{4, 6, 8}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto agg = cudf::make_min_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TEST_F(groupby_min_struct_test, slice_input) +{ + constexpr int32_t dont_care{1}; + auto const keys_original = fixed_width_column_wrapper{ + dont_care, dont_care, 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, dont_care}; + auto const vals_original = [] { + auto child1 = strings_column_wrapper{"dont_care", + "dont_care", + "año", + "bit", + "₹1", + "aaa", + "zit", + "bat", + "aab", + "$1", + "€1", + "wut", + "dont_care"}; + auto child2 = fixed_width_column_wrapper{ + dont_care, dont_care, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, dont_care}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto const keys = cudf::slice(keys_original, {2, 12})[0]; + auto const vals = cudf::slice(vals_original, {2, 12})[0]; + auto const expect_keys = fixed_width_column_wrapper{1, 2, 3}; + auto const expect_vals = [] { + auto child1 = strings_column_wrapper{"aaa", "bat", "$1"}; + auto child2 = fixed_width_column_wrapper{4, 6, 8}; + return structs_column_wrapper{{child1, child2}}; + }(); + + auto agg = cudf::make_min_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + +TEST_F(groupby_min_struct_test, null_keys_and_values) +{ + constexpr int32_t null{0}; + auto const keys = + fixed_width_column_wrapper{{1, 2, 3, 1, 2, 2, 1, null, 3, 2, 4}, null_at(7)}; + auto const vals = [] { + auto child1 = strings_column_wrapper{ + "año", "bit", "₹1", "aaa", "zit", "" /*NULL*/, "" /*NULL*/, "$1", "€1", "wut", "" /*NULL*/}; + auto child2 = fixed_width_column_wrapper{9, 8, 7, 6, 5, null, null, 2, 1, 0, null}; + return structs_column_wrapper{{child1, child2}, nulls_at({5, 6, 10})}; + }(); + + auto const expect_keys = fixed_width_column_wrapper{{1, 2, 3, 4}, no_nulls()}; + auto const expect_vals = [] { + auto child1 = strings_column_wrapper{"aaa", "bit", "€1", "" /*NULL*/}; + auto child2 = fixed_width_column_wrapper{6, 8, 1, null}; + return structs_column_wrapper{{child1, child2}, null_at(3)}; + }(); + + auto agg = cudf::make_min_aggregation(); + test_single_agg(keys, vals, expect_keys, expect_vals, std::move(agg)); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/nth_element_tests.cpp b/cpp/tests/groupby/nth_element_tests.cpp index 47dfa2426eb..976064de344 100644 --- a/cpp/tests/groupby/nth_element_tests.cpp +++ b/cpp/tests/groupby/nth_element_tests.cpp @@ -33,7 +33,7 @@ template struct groupby_nth_element_test : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(groupby_nth_element_test, cudf::test::AllTypes); +TYPED_TEST_SUITE(groupby_nth_element_test, cudf::test::AllTypes); // clang-format off TYPED_TEST(groupby_nth_element_test, basic) @@ -372,7 +372,7 @@ template struct groupby_nth_element_lists_test : BaseFixture { }; -TYPED_TEST_CASE(groupby_nth_element_lists_test, FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(groupby_nth_element_lists_test, FixedWidthTypesWithoutFixedPoint); 
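// The one-line change repeated throughout this patch replaces the deprecated GoogleTest macro
// TYPED_TEST_CASE with TYPED_TEST_SUITE (GoogleTest renamed "test case" to "test suite"; the
// old spelling still compiles but is deprecated). The registration pattern, shown with a
// stand-in fixture and type list rather than any cudf-specific one:
#include <gtest/gtest.h>

#include <cstdint>

template <typename T>
struct TypedExampleTest : public ::testing::Test {
  T value{};
};

using ExampleTypes = ::testing::Types<int32_t, int64_t, double>;
TYPED_TEST_SUITE(TypedExampleTest, ExampleTypes);  // previously: TYPED_TEST_CASE(TypedExampleTest, ExampleTypes);

TYPED_TEST(TypedExampleTest, DefaultConstructedIsZero)
{
  // TypeParam is the type currently instantiated from ExampleTypes.
  EXPECT_EQ(this->value, TypeParam{});
}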
TYPED_TEST(groupby_nth_element_lists_test, Basics) { diff --git a/cpp/tests/groupby/nunique_tests.cpp b/cpp/tests/groupby/nunique_tests.cpp index 88a6a1c903b..b18fa59d706 100644 --- a/cpp/tests/groupby/nunique_tests.cpp +++ b/cpp/tests/groupby/nunique_tests.cpp @@ -32,7 +32,7 @@ struct groupby_nunique_test : public cudf::test::BaseFixture { }; using K = int32_t; -TYPED_TEST_CASE(groupby_nunique_test, cudf::test::AllTypes); +TYPED_TEST_SUITE(groupby_nunique_test, cudf::test::AllTypes); TYPED_TEST(groupby_nunique_test, basic) { diff --git a/cpp/tests/groupby/product_tests.cpp b/cpp/tests/groupby/product_tests.cpp index 047bf856493..6f6263d90cc 100644 --- a/cpp/tests/groupby/product_tests.cpp +++ b/cpp/tests/groupby/product_tests.cpp @@ -34,7 +34,7 @@ struct groupby_product_test : public cudf::test::BaseFixture { using K = int32_t; using supported_types = cudf::test::Types; -TYPED_TEST_CASE(groupby_product_test, supported_types); +TYPED_TEST_SUITE(groupby_product_test, supported_types); TYPED_TEST(groupby_product_test, basic) { diff --git a/cpp/tests/groupby/quantile_tests.cpp b/cpp/tests/groupby/quantile_tests.cpp index 43b065ee4d3..d7a434335f9 100644 --- a/cpp/tests/groupby/quantile_tests.cpp +++ b/cpp/tests/groupby/quantile_tests.cpp @@ -34,7 +34,7 @@ struct groupby_quantile_test : public cudf::test::BaseFixture { using supported_types = cudf::test::Types; using K = int32_t; -TYPED_TEST_CASE(groupby_quantile_test, supported_types); +TYPED_TEST_SUITE(groupby_quantile_test, supported_types); TYPED_TEST(groupby_quantile_test, basic) { diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index d08bf011618..1b1b12ea69e 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -64,7 +64,7 @@ struct typed_groupby_rank_scan_test : public BaseFixture { using testing_type_set = Concat; -TYPED_TEST_CASE(typed_groupby_rank_scan_test, testing_type_set); +TYPED_TEST_SUITE(typed_groupby_rank_scan_test, testing_type_set); TYPED_TEST(typed_groupby_rank_scan_test, empty_cols) { @@ -323,7 +323,7 @@ using list_test_type_set = Concat; -TYPED_TEST_CASE(list_groupby_rank_scan_test, list_test_type_set); +TYPED_TEST_SUITE(list_groupby_rank_scan_test, list_test_type_set); TYPED_TEST(list_groupby_rank_scan_test, lists) { diff --git a/cpp/tests/groupby/replace_nulls_tests.cpp b/cpp/tests/groupby/replace_nulls_tests.cpp index 3618f531fc0..7543050d0ef 100644 --- a/cpp/tests/groupby/replace_nulls_tests.cpp +++ b/cpp/tests/groupby/replace_nulls_tests.cpp @@ -38,7 +38,7 @@ template struct GroupbyReplaceNullsFixedWidthTest : public BaseFixture { }; -TYPED_TEST_CASE(GroupbyReplaceNullsFixedWidthTest, FixedWidthTypes); +TYPED_TEST_SUITE(GroupbyReplaceNullsFixedWidthTest, FixedWidthTypes); template void TestReplaceNullsGroupbySingle( @@ -172,7 +172,7 @@ template struct GroupbyReplaceNullsListsTest : public BaseFixture { }; -TYPED_TEST_CASE(GroupbyReplaceNullsListsTest, FixedWidthTypes); +TYPED_TEST_SUITE(GroupbyReplaceNullsListsTest, FixedWidthTypes); TYPED_TEST(GroupbyReplaceNullsListsTest, PrecedingFillNonNested) { diff --git a/cpp/tests/groupby/shift_tests.cpp b/cpp/tests/groupby/shift_tests.cpp index 3a934071427..3135ed8f033 100644 --- a/cpp/tests/groupby/shift_tests.cpp +++ b/cpp/tests/groupby/shift_tests.cpp @@ -32,7 +32,7 @@ template struct groupby_shift_fixed_width_test : public BaseFixture { }; -TYPED_TEST_CASE(groupby_shift_fixed_width_test, FixedWidthTypes); +TYPED_TEST_SUITE(groupby_shift_fixed_width_test, FixedWidthTypes); template void 
test_groupby_shift_fixed_width_single(fixed_width_column_wrapper const& key, @@ -370,7 +370,7 @@ template struct groupby_shift_mixed_test : public BaseFixture { }; -TYPED_TEST_CASE(groupby_shift_mixed_test, FixedWidthTypes); +TYPED_TEST_SUITE(groupby_shift_mixed_test, FixedWidthTypes); void test_groupby_shift_multi(fixed_width_column_wrapper const& key, table_view const& value, diff --git a/cpp/tests/groupby/std_tests.cpp b/cpp/tests/groupby/std_tests.cpp index e2edabf3e8f..27f1deea844 100644 --- a/cpp/tests/groupby/std_tests.cpp +++ b/cpp/tests/groupby/std_tests.cpp @@ -36,7 +36,7 @@ struct groupby_std_test : public cudf::test::BaseFixture { using K = int32_t; using supported_types = cudf::test::Types; -TYPED_TEST_CASE(groupby_std_test, supported_types); +TYPED_TEST_SUITE(groupby_std_test, supported_types); TYPED_TEST(groupby_std_test, basic) { diff --git a/cpp/tests/groupby/sum_of_squares_tests.cpp b/cpp/tests/groupby/sum_of_squares_tests.cpp index 0dab2c6483e..4f4d15be089 100644 --- a/cpp/tests/groupby/sum_of_squares_tests.cpp +++ b/cpp/tests/groupby/sum_of_squares_tests.cpp @@ -34,7 +34,7 @@ struct groupby_sum_of_squares_test : public cudf::test::BaseFixture { using supported_types = cudf::test::Types; using K = int32_t; -TYPED_TEST_CASE(groupby_sum_of_squares_test, supported_types); +TYPED_TEST_SUITE(groupby_sum_of_squares_test, supported_types); TYPED_TEST(groupby_sum_of_squares_test, basic) { diff --git a/cpp/tests/groupby/sum_scan_tests.cpp b/cpp/tests/groupby/sum_scan_tests.cpp index 86fc0238597..2de32b70d14 100644 --- a/cpp/tests/groupby/sum_scan_tests.cpp +++ b/cpp/tests/groupby/sum_scan_tests.cpp @@ -42,7 +42,7 @@ using supported_types = cudf::test::Concat, cudf::test::DurationTypes>; -TYPED_TEST_CASE(groupby_sum_scan_test, supported_types); +TYPED_TEST_SUITE(groupby_sum_scan_test, supported_types); TYPED_TEST(groupby_sum_scan_test, basic) { @@ -136,7 +136,7 @@ template struct FixedPointTestBothReps : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, GroupBySortSumScanDecimalAsValue) { diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp index 5c935ee5a9d..1aa6358b7b2 100644 --- a/cpp/tests/groupby/sum_tests.cpp +++ b/cpp/tests/groupby/sum_tests.cpp @@ -36,7 +36,7 @@ using supported_types = cudf::test::Concat, cudf::test::DurationTypes>; -TYPED_TEST_CASE(groupby_sum_test, supported_types); +TYPED_TEST_SUITE(groupby_sum_test, supported_types); TYPED_TEST(groupby_sum_test, basic) { @@ -160,7 +160,7 @@ template struct FixedPointTestBothReps : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, GroupBySortSumDecimalAsValue) { diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 818999867c1..1199dfb44f2 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include @@ -42,7 +42,23 @@ typedef thrust::tuple expected_value; template struct TDigestAllTypes : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(TDigestAllTypes, cudf::test::NumericTypes); +TYPED_TEST_SUITE(TDigestAllTypes, cudf::test::NumericTypes); + +template +struct column_min { + __device__ double operator()(device_span vals) + 
{ + return static_cast(*thrust::min_element(thrust::seq, vals.begin(), vals.end())); + } +}; + +template +struct column_max { + __device__ double operator()(device_span vals) + { + return static_cast(*thrust::max_element(thrust::seq, vals.begin(), vals.end())); + } +}; struct tdigest_gen { template < @@ -69,15 +85,11 @@ struct tdigest_gen { } }; -void tdigest_sample_compare(column_view const& result, +void tdigest_sample_compare(cudf::tdigest::tdigest_column_view const& tdv, std::vector const& h_expected) { - cudf::detail::tdigest::check_is_valid_tdigest_column(result); - cudf::structs_column_view scv(result); - cudf::lists_column_view lcv(scv.child(cudf::detail::tdigest::centroid_column_index)); - cudf::structs_column_view tdigests(lcv.child()); - column_view result_mean = tdigests.child(cudf::detail::tdigest::mean_column_index); - column_view result_weight = tdigests.child(cudf::detail::tdigest::weight_column_index); + column_view result_mean = tdv.means(); + column_view result_weight = tdv.weights(); auto expected_mean = cudf::make_fixed_width_column( data_type{type_id::FLOAT64}, h_expected.size(), mask_state::UNALLOCATED); @@ -114,45 +126,95 @@ void tdigest_sample_compare(column_view const& result, } template -std::unique_ptr make_expected_tdigest(column_view const& mean, - column_view const& weight, - T min, - T max) +void tdigest_minmax_compare(cudf::tdigest::tdigest_column_view const& tdv, + column_view const& input_values) { - std::vector> inner_children; - inner_children.push_back(std::make_unique(mean)); - inner_children.push_back(std::make_unique(weight)); - // tdigest struct - auto tdigests = cudf::make_structs_column(mean.size(), std::move(inner_children), 0, {}); - - std::vector h_offsets{0, mean.size()}; - auto offsets = - cudf::make_fixed_width_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED); - cudaMemcpy(offsets->mutable_view().begin(), - h_offsets.data(), - sizeof(offset_type) * 2, - cudaMemcpyHostToDevice); - - auto list = cudf::make_lists_column(1, std::move(offsets), std::move(tdigests), 0, {}); - - auto min_col = - cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, 1, mask_state::UNALLOCATED); - thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), - min_col->mutable_view().begin(), - min_col->mutable_view().end(), - static_cast(min)); - auto max_col = - cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, 1, mask_state::UNALLOCATED); - thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), - max_col->mutable_view().begin(), - max_col->mutable_view().end(), - static_cast(max)); - - std::vector> children; - children.push_back(std::move(list)); - children.push_back(std::move(min_col)); - children.push_back(std::move(max_col)); - return make_structs_column(1, std::move(children), 0, {}); + // verify min/max + thrust::host_vector> h_spans; + h_spans.push_back({input_values.begin(), static_cast(input_values.size())}); + thrust::device_vector> spans(h_spans); + + auto expected_min = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, spans.size(), mask_state::UNALLOCATED); + thrust::transform(rmm::exec_policy(rmm::cuda_stream_default), + spans.begin(), + spans.end(), + expected_min->mutable_view().template begin(), + column_min{}); + column_view result_min(data_type{type_id::FLOAT64}, tdv.size(), tdv.min_begin()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_min, *expected_min); + + auto expected_max = cudf::make_fixed_width_column( + data_type{type_id::FLOAT64}, spans.size(), mask_state::UNALLOCATED); + 
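// The min/max verification here uses a common Thrust pattern: stage the inputs in a
// thrust::device_vector, then call thrust::transform with an RMM execution policy and a
// __device__ functor to produce one output value per input. A minimal standalone sketch of
// that pattern follows; `square` and `square_all` are illustrative names, not part of this
// test code.
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/device_vector.h>
#include <thrust/transform.h>

struct square {
  __device__ double operator()(double x) const { return x * x; }
};

thrust::device_vector<double> square_all(thrust::device_vector<double> const& in)
{
  thrust::device_vector<double> out(in.size());
  // One device-side functor invocation per element, on the default stream.
  thrust::transform(rmm::exec_policy(rmm::cuda_stream_default),
                    in.begin(), in.end(), out.begin(), square{});
  return out;
}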
thrust::transform(rmm::exec_policy(rmm::cuda_stream_default), + spans.begin(), + spans.end(), + expected_max->mutable_view().template begin(), + column_max{}); + column_view result_max(data_type{type_id::FLOAT64}, tdv.size(), tdv.max_begin()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result_max, *expected_max); +} + +struct expected_tdigest { + column_view mean; + column_view weight; + double min, max; +}; + +std::unique_ptr make_expected_tdigest_column(std::vector const& groups) +{ + std::vector> tdigests; + + // make an individual digest + auto make_digest = [&](expected_tdigest const& tdigest) { + std::vector> inner_children; + inner_children.push_back(std::make_unique(tdigest.mean)); + inner_children.push_back(std::make_unique(tdigest.weight)); + // tdigest struct + auto tdigests = + cudf::make_structs_column(tdigest.mean.size(), std::move(inner_children), 0, {}); + + std::vector h_offsets{0, tdigest.mean.size()}; + auto offsets = + cudf::make_fixed_width_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED); + cudaMemcpy(offsets->mutable_view().begin(), + h_offsets.data(), + sizeof(offset_type) * 2, + cudaMemcpyHostToDevice); + + auto list = cudf::make_lists_column(1, std::move(offsets), std::move(tdigests), 0, {}); + + auto min_col = + cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, 1, mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + min_col->mutable_view().begin(), + min_col->mutable_view().end(), + tdigest.min); + auto max_col = + cudf::make_fixed_width_column(data_type{type_id::FLOAT64}, 1, mask_state::UNALLOCATED); + thrust::fill(rmm::exec_policy(rmm::cuda_stream_default), + max_col->mutable_view().begin(), + max_col->mutable_view().end(), + tdigest.max); + + std::vector> children; + children.push_back(std::move(list)); + children.push_back(std::move(min_col)); + children.push_back(std::move(max_col)); + return make_structs_column(1, std::move(children), 0, {}); + }; + + // build the individual digests + std::transform(groups.begin(), groups.end(), std::back_inserter(tdigests), make_digest); + + // concatenate them + std::vector views; + std::transform(tdigests.begin(), + tdigests.end(), + std::back_inserter(views), + [](std::unique_ptr const& c) { return c->view(); }); + + return cudf::concatenate(views); } TYPED_TEST(TDigestAllTypes, Simple) @@ -172,7 +234,10 @@ TYPED_TEST(TDigestAllTypes, Simple) auto mean = cudf::cast(raw_mean, data_type{type_id::FLOAT64}); double const min = 1; double const max = 126; - auto expected = make_expected_tdigest(*mean, weight, static_cast(min), static_cast(max)); + auto expected = make_expected_tdigest_column({{*mean, + weight, + static_cast(static_cast(min)), + static_cast(static_cast(max))}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -195,7 +260,10 @@ TYPED_TEST(TDigestAllTypes, SimpleWithNulls) auto mean = cudf::cast(raw_mean, data_type{type_id::FLOAT64}); double const min = 1; double const max = 122; - auto expected = make_expected_tdigest(*mean, weight, static_cast(min), static_cast(max)); + auto expected = make_expected_tdigest_column({{*mean, + weight, + static_cast(static_cast(min)), + static_cast(static_cast(max))}}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -288,6 +356,32 @@ TYPED_TEST(TDigestAllTypes, LargeGroups) struct TDigestTest : public cudf::test::BaseFixture { }; +TEST_F(TDigestTest, EmptyMixed) +{ + cudf::test::fixed_width_column_wrapper values{ + {123456.78, 10.0, 20.0, 25.0, 30.0, 40.0, 50.0, 60.0, 70.0}, {1, 0, 0, 1, 0, 0, 1, 1, 0}}; + 
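// The comparisons in this file go through cudf::tdigest::tdigest_column_view, which exposes
// the parts of a tdigest column: per-group centroid means and weights plus a stored min and
// max. A sketch of consuming the accessors used above; the include path and the iterator
// types returned by min_begin()/max_begin() are assumptions inferred from how these tests
// use them, and `inspect_tdigest` is an illustrative name.
#include <cudf/column/column_view.hpp>
#include <cudf/tdigest/tdigest_column_view.hpp>  // assumed header location

void inspect_tdigest(cudf::column_view const& tdigest_result)
{
  cudf::tdigest::tdigest_column_view tdv(tdigest_result);
  cudf::column_view means   = tdv.means();    // FLOAT64 centroid means
  cudf::column_view weights = tdv.weights();  // FLOAT64 centroid weights
  auto d_min = tdv.min_begin();               // device iterator, one min per tdigest row
  auto d_max = tdv.max_begin();               // device iterator, one max per tdigest row
  (void)means; (void)weights; (void)d_min; (void)d_max;
}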
cudf::test::strings_column_wrapper keys{"b", "a", "c", "c", "d", "d", "e", "e", "f"}; + + auto const delta = 1000; + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({values, std::move(aggregations)}); + auto result = gb.aggregate(requests); + + using FCW = cudf::test::fixed_width_column_wrapper; + auto expected = make_expected_tdigest_column({{FCW{}, FCW{}, 0, 0}, + {FCW{123456.78}, FCW{1.0}, 123456.78, 123456.78}, + {FCW{25.0}, FCW{1.0}, 25.0, 25.0}, + {FCW{}, FCW{}, 0, 0}, + {FCW{50.0, 60.0}, FCW{1.0, 1.0}, 50.0, 60.0}, + {FCW{}, FCW{}, 0, 0}}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result.second[0].results[0], *expected); +} + TEST_F(TDigestTest, LargeInputDouble) { // these tests are being done explicitly because of the way we have to precompute the correct @@ -322,8 +416,12 @@ TEST_F(TDigestTest, LargeInputDouble) {409, 94.07685720279611985006, 1272}, {491, 99.94197663121231300920, 130}, {500, 99.99969880795092080916, 2}}; + cudf::tdigest::tdigest_column_view tdv(*result); + + tdigest_sample_compare(tdv, expected); - tdigest_sample_compare(*result, expected); + // verify min/max + tdigest_minmax_compare(tdv, *values); } // delta 100 @@ -338,8 +436,12 @@ TEST_F(TDigestTest, LargeInputDouble) {38, 90.61229683516096145013, 15581}, {46, 99.07283498858802772702, 5142}, {50, 99.99970905482754801596, 1}}; + cudf::tdigest::tdigest_column_view tdv(*result); + + tdigest_sample_compare(tdv, expected); - tdigest_sample_compare(*result, expected); + // verify min/max + tdigest_minmax_compare(tdv, *values); } // delta 10 @@ -353,8 +455,12 @@ TEST_F(TDigestTest, LargeInputDouble) {3, 83.46216572053654658703, 187500}, {4, 96.42204425201593664951, 71620}, {5, 99.99970905482754801596, 1}}; + cudf::tdigest::tdigest_column_view tdv(*result); - tdigest_sample_compare(*result, expected); + tdigest_sample_compare(tdv, expected); + + // verify min/max + tdigest_minmax_compare(tdv, *values); } } @@ -395,8 +501,12 @@ TEST_F(TDigestTest, LargeInputInt) {418, 95, 1157}, {479, 99, 307}, {500, 99, 2}}; + cudf::tdigest::tdigest_column_view tdv(*result); + + tdigest_sample_compare(tdv, expected); - tdigest_sample_compare(*result, expected); + // verify min/max + tdigest_minmax_compare(tdv, *values); } // delta 100 @@ -411,8 +521,12 @@ TEST_F(TDigestTest, LargeInputInt) {38, 90.14209614273795523332, 15581}, {46, 98.64041229093737683797, 5142}, {50, 99, 1}}; + cudf::tdigest::tdigest_column_view tdv(*result); - tdigest_sample_compare(*result, expected); + tdigest_sample_compare(tdv, expected); + + // verify min/max + tdigest_minmax_compare(tdv, *values); } // delta 10 @@ -426,8 +540,12 @@ TEST_F(TDigestTest, LargeInputInt) {3, 82.96355733333332693746, 187500}, {4, 95.91280368612116546956, 71620}, {5, 99, 1}}; + cudf::tdigest::tdigest_column_view tdv(*result); + + tdigest_sample_compare(tdv, expected); - tdigest_sample_compare(*result, expected); + // verify min/max + tdigest_minmax_compare(tdv, *values); } } @@ -441,6 +559,7 @@ TEST_F(TDigestTest, LargeInputDecimal) // decimal, int, bool) auto values = generate_standardized_percentile_distribution(data_type{type_id::DECIMAL32, -4}); + auto cast_values = cudf::cast(*values, data_type{type_id::FLOAT64}); // all in the same group auto keys = cudf::make_fixed_width_column( data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); @@ -465,8 +584,12 @@ TEST_F(TDigestTest, LargeInputDecimal) {409, 
94.07680636792450457051, 1272}, {491, 99.94192461538463589932, 130}, {500, 99.99965000000000259206, 2}}; + cudf::tdigest::tdigest_column_view tdv(*result); - tdigest_sample_compare(*result, expected); + tdigest_sample_compare(tdv, expected); + + // verify min/max + tdigest_minmax_compare(tdv, *cast_values); } // delta 100 @@ -481,8 +604,12 @@ TEST_F(TDigestTest, LargeInputDecimal) {38, 90.61224673640975879607, 15581}, {46, 99.07278498638662256326, 5142}, {50, 99.99970000000000425189, 1}}; + cudf::tdigest::tdigest_column_view tdv(*result); + + tdigest_sample_compare(tdv, expected); - tdigest_sample_compare(*result, expected); + // verify min/max + tdigest_minmax_compare(tdv, *cast_values); } // delta 10 @@ -496,8 +623,12 @@ TEST_F(TDigestTest, LargeInputDecimal) {3, 83.46211575573336460820, 187500}, {4, 96.42199425300195514410, 71620}, {5, 99.99970000000000425189, 1}}; + cudf::tdigest::tdigest_column_view tdv(*result); - tdigest_sample_compare(*result, expected); + tdigest_sample_compare(tdv, expected); + + // verify min/max + tdigest_minmax_compare(tdv, *cast_values); } } @@ -562,6 +693,9 @@ TEST_F(TDigestMergeTest, Simple) requests.push_back({*merge_input, std::move(aggregations)}); auto result = gb.aggregate(requests); + cudf::tdigest::tdigest_column_view tdv(*result.second[0].results[0]); + + // verify centroids std::vector expected{{0, 0.00013945158577498588, 2}, {10, 0.04804393446447510763, 50}, {59, 1.68846964439246893797, 284}, @@ -575,10 +709,279 @@ TEST_F(TDigestMergeTest, Simple) {625, 98.20470345147104751504, 405}, {700, 99.96818381983835877236, 56}, {711, 99.99970905482754801596, 1}}; + tdigest_sample_compare(tdv, expected); + + // verify min/max + tdigest_minmax_compare(tdv, *values); + } +} + +struct key_groups { + __device__ size_type operator()(size_type i) { return i < 250000 ? 0 : 1; } +}; +TEST_F(TDigestMergeTest, Grouped) +{ + auto values = generate_standardized_percentile_distribution(data_type{type_id::FLOAT64}); + CUDF_EXPECTS(values->size() == 750000, "Unexpected distribution size"); + // all in the same group + auto keys = cudf::make_fixed_width_column( + data_type{type_id::INT32}, values->size(), mask_state::UNALLOCATED); + // 3 groups. 0-250000 in group 0. 
250000-500000 in group 1 and 500000-750000 in group 1 + auto key_iter = cudf::detail::make_counting_transform_iterator(0, key_groups{}); + thrust::copy(rmm::exec_policy(rmm::cuda_stream_default), + key_iter, + key_iter + keys->size(), + keys->mutable_view().template begin()); + + auto split_values = cudf::split(*values, {250000, 500000}); + auto grouped_split_values = cudf::split(*values, {250000}); + auto split_keys = cudf::split(*keys, {250000, 500000}); + + int const delta = 1000; + + // generate seperate digests + std::vector> parts; + auto iter = thrust::make_counting_iterator(0); + std::transform( + iter, + iter + split_values.size(), + std::back_inserter(parts), + [&split_keys, &split_values, delta](int i) { + cudf::table_view t({split_keys[i]}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_tdigest_aggregation(delta)); + requests.push_back({split_values[i], std::move(aggregations)}); + auto result = gb.aggregate(requests); + return std::move(result.second[0].results[0]); + }); + std::vector part_views; + std::transform(parts.begin(), + parts.end(), + std::back_inserter(part_views), + [](std::unique_ptr const& col) { return col->view(); }); + + // merge delta = 1000 + { + int const merge_delta = 1000; + + // merge them + auto merge_input = cudf::concatenate(part_views); + cudf::test::fixed_width_column_wrapper merge_keys{0, 1, 1}; + cudf::table_view key_table({merge_keys}); + cudf::groupby::groupby gb(key_table); + std::vector requests; + std::vector> aggregations; + aggregations.push_back( + cudf::make_merge_tdigest_aggregation(merge_delta)); + requests.push_back({*merge_input, std::move(aggregations)}); + auto result = gb.aggregate(requests); + + CUDF_EXPECTS(result.second[0].results[0]->size() == 2, "Unexpected tdigest merge result size"); + cudf::tdigest::tdigest_column_view tdv(*result.second[0].results[0]); + + // verify centroids + std::vector expected{// group 0 + {0, 0.00013945158577498588, 2}, + {10, 0.04804393446447509375, 50}, + {66, 2.10089484962640948851, 316}, + {139, 8.92977366346101852912, 601}, + {243, 23.89152910016953867967, 784}, + {366, 41.62636569363655780762, 586}, + {432, 47.73085102980330418632, 326}, + {460, 49.20637897385523018556, 196}, + {501, 49.99998311512171511595, 1}, + // group 1 + {502 + 0, 50.00022508669655252334, 2}, + {502 + 15, 50.05415694538910287292, 74}, + {502 + 70, 51.21421484112906341579, 334}, + {502 + 150, 55.19367617848146778670, 635}, + {502 + 260, 63.24605285552920008740, 783}, + {502 + 380, 76.99522005804017510400, 1289}, + {502 + 440, 84.22673817294192133431, 758}, + {502 + 490, 88.11787981529532487457, 784}, + {502 + 555, 93.02766411136053648079, 704}, + {502 + 618, 96.91486035315536184953, 516}, + {502 + 710, 99.87755861436669135855, 110}, + {502 + 733, 99.99970905482754801596, 1}}; + tdigest_sample_compare(tdv, expected); + + // verify min/max + auto split_results = cudf::split(*result.second[0].results[0], {1}); + auto iter = thrust::make_counting_iterator(0); + std::for_each(iter, iter + split_results.size(), [&](size_type i) { + auto copied = std::make_unique(split_results[i]); + tdigest_minmax_compare(cudf::tdigest::tdigest_column_view(*copied), + grouped_split_values[i]); + }); + } + + // merge delta = 100 + { + int const merge_delta = 100; + + // merge them + auto merge_input = cudf::concatenate(part_views); + cudf::test::fixed_width_column_wrapper merge_keys{0, 1, 1}; + cudf::table_view key_table({merge_keys}); + cudf::groupby::groupby 
gb(key_table); + std::vector requests; + std::vector> aggregations; + aggregations.push_back( + cudf::make_merge_tdigest_aggregation(merge_delta)); + requests.push_back({*merge_input, std::move(aggregations)}); + auto result = gb.aggregate(requests); + + CUDF_EXPECTS(result.second[0].results[0]->size() == 2, "Unexpected tdigest merge result size"); + cudf::tdigest::tdigest_column_view tdv(*result.second[0].results[0]); + + // verify centroids + std::vector expected{// group 0 + {0, 0.02182479870203561656, 231}, + {3, 0.60625795002234528219, 1688}, + {13, 8.40462931740497687372, 5867}, + {27, 28.79997783486397722186, 7757}, + {35, 40.22391421196020644402, 6224}, + {45, 48.96506331299028857984, 2225}, + {50, 49.99979491345574444949, 4}, + // group 1 + {51 + 0, 50.02171921312970681583, 460}, + {51 + 5, 51.45308398121498072442, 5074}, + {51 + 11, 55.96880716301625113829, 10011}, + {51 + 22, 70.18029861315150697010, 15351}, + {51 + 38, 92.65943436519887654867, 10718}, + {51 + 47, 99.27745505225347244505, 3639}}; + tdigest_sample_compare(tdv, expected); + + // verify min/max + auto split_results = cudf::split(*result.second[0].results[0], {1}); + auto iter = thrust::make_counting_iterator(0); + std::for_each(iter, iter + split_results.size(), [&](size_type i) { + auto copied = std::make_unique(split_results[i]); + tdigest_minmax_compare(cudf::tdigest::tdigest_column_view(*copied), + grouped_split_values[i]); + }); + } + + // merge delta = 10 + { + int const merge_delta = 10; + + // merge them + auto merge_input = cudf::concatenate(part_views); + cudf::test::fixed_width_column_wrapper merge_keys{0, 1, 1}; + cudf::table_view key_table({merge_keys}); + cudf::groupby::groupby gb(key_table); + std::vector requests; + std::vector> aggregations; + aggregations.push_back( + cudf::make_merge_tdigest_aggregation(merge_delta)); + requests.push_back({*merge_input, std::move(aggregations)}); + auto result = gb.aggregate(requests); - tdigest_sample_compare(*result.second[0].results[0], expected); + CUDF_EXPECTS(result.second[0].results[0]->size() == 2, "Unexpected tdigest merge result size"); + cudf::tdigest::tdigest_column_view tdv(*result.second[0].results[0]); + + // verify centroids + std::vector expected{// group 0 + {0, 2.34644806683495144028, 23623}, + {1, 10.95523693698660672169, 62290}, + {2, 24.90731657803452847588, 77208}, + {3, 38.88062495289155862110, 62658}, + {4, 47.56288303840698006297, 24217}, + {5, 49.99979491345574444949, 4}, + // group 1 + {6 + 0, 52.40174463129091719793, 47410}, + {6 + 1, 60.97025126481504031517, 124564}, + {6 + 2, 74.91722742839780835311, 154387}, + {6 + 3, 88.87559489177009197647, 124810}, + {6 + 4, 97.55823307073454486726, 48817}, + {6 + 5, 99.99901807905750672489, 12}}; + tdigest_sample_compare(tdv, expected); + + // verify min/max + auto split_results = cudf::split(*result.second[0].results[0], {1}); + auto iter = thrust::make_counting_iterator(0); + std::for_each(iter, iter + split_results.size(), [&](size_type i) { + auto copied = std::make_unique(split_results[i]); + tdigest_minmax_compare(cudf::tdigest::tdigest_column_view(*copied), + grouped_split_values[i]); + }); } } +TEST_F(TDigestMergeTest, Empty) +{ + // 3 empty tdigests all in the same group + auto a = cudf::detail::tdigest::make_empty_tdigest_column(); + auto b = cudf::detail::tdigest::make_empty_tdigest_column(); + auto c = cudf::detail::tdigest::make_empty_tdigest_column(); + std::vector cols; + cols.push_back(*a); + cols.push_back(*b); + cols.push_back(*c); + auto values = cudf::concatenate(cols); + 
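// Every test in this file drives the groupby API the same way: wrap the key column(s) in a
// table_view, fill an aggregation_request with the value column and the desired
// aggregation(s), then call aggregate(). A minimal sketch of that shape, using a sum
// aggregation purely for illustration; the template argument on make_sum_aggregation is an
// assumption, and `group_sum` is an illustrative name.
#include <cudf/aggregation.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/groupby.hpp>
#include <cudf/table/table_view.hpp>
#include <memory>
#include <vector>

std::unique_ptr<cudf::column> group_sum(cudf::column_view const& keys,
                                        cudf::column_view const& vals)
{
  cudf::groupby::groupby gb(cudf::table_view({keys}));

  std::vector<cudf::groupby::aggregation_request> requests;
  std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations;
  aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
  requests.push_back({vals, std::move(aggregations)});

  // aggregate() returns {unique key table, one aggregation_result per request}.
  auto result = gb.aggregate(requests);
  return std::move(result.second[0].results[0]);
}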
cudf::test::fixed_width_column_wrapper keys{0, 0, 0}; + + auto const delta = 1000; + cudf::table_view t({keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_merge_tdigest_aggregation(delta)); + requests.push_back({*values, std::move(aggregations)}); + auto result = gb.aggregate(requests); + + auto expected = cudf::detail::tdigest::make_empty_tdigest_column(); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); +} + +TEST_F(TDigestMergeTest, EmptyGroups) +{ + cudf::test::fixed_width_column_wrapper values_b{{126, 15, 1, 99, 67, 55, 2}, + {1, 0, 0, 1, 1, 1, 1}}; + cudf::test::fixed_width_column_wrapper values_d{{100, 200, 300, 400, 500, 600, 700}, + {1, 1, 1, 1, 1, 1, 0}}; + cudf::test::fixed_width_column_wrapper keys{0, 0, 0, 0, 0, 0, 0}; + int const delta = 1000; + + auto a = cudf::detail::tdigest::make_empty_tdigest_column(); + auto b = cudf::type_dispatcher( + static_cast(values_b).type(), tdigest_gen{}, keys, values_b, delta); + auto c = cudf::detail::tdigest::make_empty_tdigest_column(); + auto d = cudf::type_dispatcher( + static_cast(values_d).type(), tdigest_gen{}, keys, values_d, delta); + auto e = cudf::detail::tdigest::make_empty_tdigest_column(); + + std::vector cols; + cols.push_back(*a); + cols.push_back(*b); + cols.push_back(*c); + cols.push_back(*d); + cols.push_back(*e); + auto values = cudf::concatenate(cols); + + cudf::test::fixed_width_column_wrapper merge_keys{0, 0, 1, 0, 2}; + + cudf::table_view t({merge_keys}); + cudf::groupby::groupby gb(t); + std::vector requests; + std::vector> aggregations; + aggregations.push_back(cudf::make_merge_tdigest_aggregation(delta)); + requests.push_back({*values, std::move(aggregations)}); + auto result = gb.aggregate(requests); + + using FCW = cudf::test::fixed_width_column_wrapper; + cudf::test::fixed_width_column_wrapper expected_means{ + 2, 55, 67, 99, 100, 126, 200, 300, 400, 500, 600}; + cudf::test::fixed_width_column_wrapper expected_weights{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto expected = make_expected_tdigest_column( + {{expected_means, expected_weights, 2, 600}, {FCW{}, FCW{}, 0, 0}, {FCW{}, FCW{}, 0, 0}}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]); +} + } // namespace test } // namespace cudf diff --git a/cpp/tests/groupby/var_tests.cpp b/cpp/tests/groupby/var_tests.cpp index 68ccf791960..cc87ece1c65 100644 --- a/cpp/tests/groupby/var_tests.cpp +++ b/cpp/tests/groupby/var_tests.cpp @@ -36,7 +36,7 @@ using K = int32_t; using supported_types = cudf::test::Types; -TYPED_TEST_CASE(groupby_var_test, supported_types); +TYPED_TEST_SUITE(groupby_var_test, supported_types); TYPED_TEST(groupby_var_test, basic) { diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu index a747646d894..94648ef00b1 100644 --- a/cpp/tests/hash_map/map_test.cu +++ b/cpp/tests/hash_map/map_test.cu @@ -73,7 +73,7 @@ using TestTypes = ::testing::Types, key_value_types, key_value_types>; -TYPED_TEST_CASE(InsertTest, TestTypes); +TYPED_TEST_SUITE(InsertTest, TestTypes); template struct insert_pair { diff --git a/cpp/tests/hash_map/multimap_test.cu b/cpp/tests/hash_map/multimap_test.cu index 21135746227..4a0e3807a4c 100644 --- a/cpp/tests/hash_map/multimap_test.cu +++ b/cpp/tests/hash_map/multimap_test.cu @@ -81,7 +81,7 @@ typedef ::testing::Types, KeyValueTypes> Implementations; -TYPED_TEST_CASE(MultimapTest, Implementations); +TYPED_TEST_SUITE(MultimapTest, Implementations); TYPED_TEST(MultimapTest, 
InitialState) { diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp index a605fdee6a0..ee321b761db 100644 --- a/cpp/tests/hashing/hash_test.cpp +++ b/cpp/tests/hashing/hash_test.cpp @@ -133,7 +133,7 @@ template class HashTestTyped : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(HashTestTyped, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(HashTestTyped, cudf::test::FixedWidthTypes); TYPED_TEST(HashTestTyped, Equality) { @@ -194,7 +194,7 @@ template class HashTestFloatTyped : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(HashTestFloatTyped, cudf::test::FloatingPointTypes); +TYPED_TEST_SUITE(HashTestFloatTyped, cudf::test::FloatingPointTypes); TYPED_TEST(HashTestFloatTyped, TestExtremes) { @@ -576,7 +576,7 @@ template class MD5HashTestTyped : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(MD5HashTestTyped, cudf::test::NumericTypes); +TYPED_TEST_SUITE(MD5HashTestTyped, cudf::test::NumericTypes); TYPED_TEST(MD5HashTestTyped, Equality) { @@ -637,7 +637,7 @@ class MD5HashListTestTyped : public cudf::test::BaseFixture { }; using NumericTypesNoBools = Concat; -TYPED_TEST_CASE(MD5HashListTestTyped, NumericTypesNoBools); +TYPED_TEST_SUITE(MD5HashListTestTyped, NumericTypesNoBools); TYPED_TEST(MD5HashListTestTyped, TestListsWithNulls) { @@ -669,7 +669,7 @@ template class MD5HashTestFloatTyped : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(MD5HashTestFloatTyped, cudf::test::FloatingPointTypes); +TYPED_TEST_SUITE(MD5HashTestFloatTyped, cudf::test::FloatingPointTypes); TYPED_TEST(MD5HashTestFloatTyped, TestExtremes) { diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp index 4d8a94f276d..7ea5d1edb97 100644 --- a/cpp/tests/interop/dlpack_test.cpp +++ b/cpp/tests/interop/dlpack_test.cpp @@ -210,7 +210,7 @@ template class DLPackTimestampTests : public BaseFixture { }; -TYPED_TEST_CASE(DLPackTimestampTests, ChronoTypes); +TYPED_TEST_SUITE(DLPackTimestampTests, ChronoTypes); TYPED_TEST(DLPackTimestampTests, ChronoTypesToDlpack) { @@ -227,7 +227,7 @@ class DLPackNumericTests : public BaseFixture { // TODO: Replace with `NumericTypes` when unsigned support is added. 
Issue #5353 using SupportedTypes = cudf::test::RemoveIf>, cudf::test::NumericTypes>; -TYPED_TEST_CASE(DLPackNumericTests, SupportedTypes); +TYPED_TEST_SUITE(DLPackNumericTests, SupportedTypes); TYPED_TEST(DLPackNumericTests, ToDlpack1D) { diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index ae8808ba59d..52d5da8f6e5 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -62,7 +62,7 @@ template struct FromArrowTestDurationsTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FromArrowTestDurationsTest, cudf::test::DurationTypes); +TYPED_TEST_SUITE(FromArrowTestDurationsTest, cudf::test::DurationTypes); TEST_F(FromArrowTest, EmptyTable) { diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 00d625175d0..9ad546d3e01 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -159,7 +159,7 @@ template struct ToArrowTestDurationsTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ToArrowTestDurationsTest, cudf::test::DurationTypes); +TYPED_TEST_SUITE(ToArrowTestDurationsTest, cudf::test::DurationTypes); TEST_F(ToArrowTest, EmptyTable) { diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 5b6270a8be1..b7835b4d4d1 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -81,7 +81,7 @@ template struct CsvFixedPointWriterTest : public CsvWriterTest { }; -TYPED_TEST_CASE(CsvFixedPointWriterTest, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(CsvFixedPointWriterTest, cudf::test::FixedPointTypes); // Base test fixture for tests struct CsvReaderTest : public cudf::test::BaseFixture { @@ -94,7 +94,7 @@ struct CsvReaderNumericTypeTest : public CsvReaderTest { // Declare typed test cases using SupportedNumericTypes = cudf::test::Types; -TYPED_TEST_CASE(CsvReaderNumericTypeTest, SupportedNumericTypes); +TYPED_TEST_SUITE(CsvReaderNumericTypeTest, SupportedNumericTypes); // Typed test to be instantiated for numeric::decimal32 and numeric::decimal64 template @@ -125,7 +125,7 @@ struct CsvFixedPointReaderTest : public CsvReaderTest { } }; -TYPED_TEST_CASE(CsvFixedPointReaderTest, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(CsvFixedPointReaderTest, cudf::test::FixedPointTypes); namespace { // Generates a vector of uniform random values of type T @@ -407,7 +407,8 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnNegativeScale) auto filepath = temp_env->get_temp_dir() + "FixedPointSingleColumnNegativeScale.csv"; cudf_io::csv_writer_options writer_options = - cudf_io::csv_writer_options::builder(cudf_io::sink_info(filepath), input_table); + cudf_io::csv_writer_options::builder(cudf_io::sink_info(filepath), input_table) + .include_header(false); cudf_io::write_csv(writer_options); @@ -453,7 +454,8 @@ TYPED_TEST(CsvFixedPointWriterTest, SingleColumnPositiveScale) auto filepath = temp_env->get_temp_dir() + "FixedPointSingleColumnPositiveScale.csv"; cudf_io::csv_writer_options writer_options = - cudf_io::csv_writer_options::builder(cudf_io::sink_info(filepath), input_table); + cudf_io::csv_writer_options::builder(cudf_io::sink_info(filepath), input_table) + .include_header(false); cudf_io::write_csv(writer_options); @@ -2198,4 +2200,32 @@ TEST_F(CsvReaderTest, DtypesMapInvalid) EXPECT_THROW(cudf_io::read_csv(in_opts), cudf::logic_error); } +TEST_F(CsvReaderTest, CsvDefaultOptionsWriteReadMatch) +{ + auto const filepath = temp_env->get_temp_dir() + "issue.csv"; + + // make up some kind of dataframe + 
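// "Make up some kind of dataframe" means building a small table from host literals with the
// cudf test column wrappers; the wrappers own the device data, so they must outlive any
// table_view built from them. A standalone sketch of that idiom; `tiny_table_example` is an
// illustrative name.
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <vector>

void tiny_table_example()
{
  cudf::test::fixed_width_column_wrapper<int32_t> ints{10, 20, 30};
  cudf::test::strings_column_wrapper strs{"abc", "mno", "xyz"};
  // The wrappers convert implicitly to column_view.
  cudf::table_view tbl(std::vector<cudf::column_view>{ints, strs});
  // tbl now views 3 rows x 2 columns of device data.
}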
auto int_column = column_wrapper{10, 20, 30}; + auto str_column = column_wrapper{"abc", "mno", "xyz"}; + cudf::table_view input_table(std::vector{int_column, str_column}); + + // write that dataframe to a csv using default options to some temporary file + cudf_io::csv_writer_options writer_options = + cudf_io::csv_writer_options::builder(cudf_io::sink_info{filepath}, input_table); + cudf_io::write_csv(writer_options); + + // read the temp csv file using default options + cudf_io::csv_reader_options read_options = + cudf_io::csv_reader_options::builder(cudf_io::source_info{filepath}) + .dtypes(std::vector{dtype(), dtype()}); + + cudf_io::table_with_metadata new_table_and_metadata = cudf_io::read_csv(read_options); + + // verify that the tables are identical, or as identical as expected. + const auto new_table_view = new_table_and_metadata.tbl->view(); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(input_table, new_table_view); + EXPECT_EQ(new_table_and_metadata.metadata.column_names[0], "0"); + EXPECT_EQ(new_table_and_metadata.metadata.column_names[1], "1"); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index cdf0a3b275b..0633dfbf791 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -99,11 +99,11 @@ struct OrcWriterTimestampTypeTest : public OrcWriterTest { // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5351 using SupportedTypes = cudf::test::Types; -TYPED_TEST_CASE(OrcWriterNumericTypeTest, SupportedTypes); +TYPED_TEST_SUITE(OrcWriterNumericTypeTest, SupportedTypes); using SupportedTimestampTypes = cudf::test::RemoveIf>, cudf::test::TimestampTypes>; -TYPED_TEST_CASE(OrcWriterTimestampTypeTest, SupportedTimestampTypes); +TYPED_TEST_SUITE(OrcWriterTimestampTypeTest, SupportedTimestampTypes); // Base test fixture for chunked writer tests struct OrcChunkedWriterTest : public cudf::test::BaseFixture { @@ -116,7 +116,7 @@ struct OrcChunkedWriterNumericTypeTest : public OrcChunkedWriterTest { }; // Declare typed test cases -TYPED_TEST_CASE(OrcChunkedWriterNumericTypeTest, SupportedTypes); +TYPED_TEST_SUITE(OrcChunkedWriterNumericTypeTest, SupportedTypes); // Test fixture for reader tests struct OrcReaderTest : public cudf::test::BaseFixture { @@ -306,6 +306,31 @@ TYPED_TEST(OrcWriterTimestampTypeTest, TimestampsWithNulls) CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); } +TYPED_TEST(OrcWriterTimestampTypeTest, TimestampOverflow) +{ + constexpr int64_t max = std::numeric_limits::max(); + auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return max - i; }); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + constexpr auto num_rows = 100; + column_wrapper col( + sequence, sequence + num_rows, validity); + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("OrcTimestampOverflow.orc"); + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected); + cudf_io::write_orc(out_opts); + + cudf_io::orc_reader_options in_opts = + cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) + .use_index(false) + .timestamp_type(this->type()); + auto result = cudf_io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + TEST_F(OrcWriterTest, MultiColumn) { constexpr auto num_rows = 10; @@ -975,7 +1000,7 @@ TEST_F(OrcStatisticsTest, Basic) auto const stats = 
cudf_io::read_parsed_orc_statistics(cudf_io::source_info{filepath}); auto const expected_column_names = - std::vector{"col0", "_col0", "_col1", "_col2", "_col3", "_col4"}; + std::vector{"", "_col0", "_col1", "_col2", "_col3", "_col4"}; EXPECT_EQ(stats.column_names, expected_column_names); auto validate_statistics = [&](std::vector const& stats) { @@ -1210,5 +1235,176 @@ TEST_F(OrcStatisticsTest, Overflow) check_sum_exist(3, true); check_sum_exist(4, true); } +struct OrcWriterTestStripes + : public OrcWriterTest, + public ::testing::WithParamInterface> { +}; + +TEST_P(OrcWriterTestStripes, StripeSize) +{ + constexpr auto num_rows = 1000000; + auto size_bytes = std::get<0>(GetParam()); + auto size_rows = std::get<1>(GetParam()); + + const auto seq_col = random_values(num_rows); + const auto validity = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + column_wrapper col{seq_col.begin(), seq_col.end(), validity}; + + std::vector> cols; + cols.push_back(col.release()); + const auto expected = std::make_unique
(std::move(cols)); + + auto validate = [&](std::vector const& orc_buffer) { + auto const expected_stripe_num = + std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); + auto const stats = cudf_io::read_parsed_orc_statistics( + cudf_io::source_info(orc_buffer.data(), orc_buffer.size())); + EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); + + cudf_io::orc_reader_options in_opts = + cudf_io::orc_reader_options::builder( + cudf_io::source_info(orc_buffer.data(), orc_buffer.size())) + .use_index(false); + auto result = cudf_io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + }; + + { + std::vector out_buffer_chunked; + cudf_io::chunked_orc_writer_options opts = + cudf_io::chunked_orc_writer_options::builder(cudf_io::sink_info(&out_buffer_chunked)) + .stripe_size_rows(size_rows) + .stripe_size_bytes(size_bytes); + cudf_io::orc_chunked_writer(opts).write(expected->view()); + validate(out_buffer_chunked); + } + { + std::vector out_buffer; + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info(&out_buffer), expected->view()) + .stripe_size_rows(size_rows) + .stripe_size_bytes(size_bytes); + cudf_io::write_orc(out_opts); + validate(out_buffer); + } +} + +INSTANTIATE_TEST_CASE_P(OrcWriterTest, + OrcWriterTestStripes, + ::testing::Values(std::make_tuple(800000ul, 1000000), + std::make_tuple(2000000ul, 1000000), + std::make_tuple(4000000ul, 1000000), + std::make_tuple(8000000ul, 1000000), + std::make_tuple(8000000ul, 500000), + std::make_tuple(8000000ul, 250000), + std::make_tuple(8000000ul, 100000))); + +TEST_F(OrcWriterTest, StripeSizeInvalid) +{ + const auto unused_table = std::make_unique
(); + std::vector out_buffer; + + EXPECT_THROW( + cudf_io::orc_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .stripe_size_rows(511), + cudf::logic_error); + EXPECT_THROW( + cudf_io::orc_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .stripe_size_bytes(63 << 10), + cudf::logic_error); + EXPECT_THROW( + cudf_io::orc_writer_options::builder(cudf_io::sink_info(&out_buffer), unused_table->view()) + .row_index_stride(511), + cudf::logic_error); +} + +TEST_F(OrcWriterTest, TestMap) +{ + auto const num_rows = 1200000; + auto const lists_per_row = 4; + auto const num_child_rows = (num_rows * lists_per_row) / 2; // half due to validity + + auto keys = random_values(num_child_rows); + auto vals = random_values(num_child_rows); + auto keys_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + auto vals_mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; }); + column_wrapper keys_col{keys.begin(), keys.end(), keys_mask}; + column_wrapper vals_col{vals.begin(), vals.end(), vals_mask}; + auto struct_col = cudf::test::structs_column_wrapper({keys_col, vals_col}).release(); + + auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; }); + + std::vector row_offsets(num_rows + 1); + int offset = 0; + for (int idx = 0; idx < (num_rows) + 1; ++idx) { + row_offsets[idx] = offset; + if (valids[idx]) { offset += lists_per_row; } + } + cudf::test::fixed_width_column_wrapper offsets(row_offsets.begin(), row_offsets.end()); + + auto num_list_rows = static_cast(offsets).size() - 1; + auto list_col = + cudf::make_lists_column(num_list_rows, + offsets.release(), + std::move(struct_col), + cudf::UNKNOWN_NULL_COUNT, + cudf::test::detail::make_null_mask(valids, valids + num_list_rows)); + + table_view expected({*list_col}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_list_column_as_map(); + + auto filepath = temp_env->get_temp_filepath("MapColumn.orc"); + cudf_io::orc_writer_options out_opts = + cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected) + .metadata(&expected_metadata); + cudf_io::write_orc(out_opts); + + cudf_io::orc_reader_options in_opts = + cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}).use_index(false); + auto result = cudf_io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); + cudf::test::expect_metadata_equal(expected_metadata, result.metadata); +} + +TEST_F(OrcReaderTest, NestedColumnSelection) +{ + auto const num_rows = 1000; + auto child_col1_data = random_values(num_rows); + auto child_col2_data = random_values(num_rows); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; }); + column_wrapper child_col1 = {child_col1_data.begin(), child_col1_data.end(), validity}; + column_wrapper child_col2 = {child_col2_data.begin(), child_col2_data.end(), validity}; + auto struct_col = cudf::test::structs_column_wrapper{child_col1, child_col2}; + table_view expected({struct_col}); + + cudf_io::table_input_metadata expected_metadata(expected); + expected_metadata.column_metadata[0].set_name("struct_s"); + expected_metadata.column_metadata[0].child(0).set_name("field_a"); + expected_metadata.column_metadata[0].child(1).set_name("field_b"); + + auto filepath = temp_env->get_temp_filepath("OrcNestedSelection.orc"); + cudf_io::orc_writer_options out_opts = + 
cudf_io::orc_writer_options::builder(cudf_io::sink_info{filepath}, expected) + .metadata(&expected_metadata); + cudf_io::write_orc(out_opts); + + cudf_io::orc_reader_options in_opts = + cudf_io::orc_reader_options::builder(cudf_io::source_info{filepath}) + .use_index(false) + .columns({"struct_s.field_b"}); + auto result = cudf_io::read_orc(in_opts); + + // Verify that only one child column is included in the output table + ASSERT_EQ(1, result.tbl->view().column(0).num_children()); + // Verify that the first child column is `field_b` + column_wrapper expected_col = {child_col2_data.begin(), child_col2_data.end(), validity}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_col, result.tbl->view().column(0).child(0)); + ASSERT_EQ("field_b", result.metadata.schema_info[0].children[0].name); +} CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 0f59b0d5e15..3bae8d7ab1e 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -209,12 +209,22 @@ struct ParquetWriterChronoTypeTest : public ParquetWriterTest { auto type() { return cudf::data_type{cudf::type_to_id()}; } }; +// Typed test fixture for timestamp type tests +template +struct ParquetWriterTimestampTypeTest : public ParquetWriterTest { + auto type() { return cudf::data_type{cudf::type_to_id()}; } +}; + // Declare typed test cases // TODO: Replace with `NumericTypes` when unsigned support is added. Issue #5352 using SupportedTypes = cudf::test::Types; -TYPED_TEST_CASE(ParquetWriterNumericTypeTest, SupportedTypes); +TYPED_TEST_SUITE(ParquetWriterNumericTypeTest, SupportedTypes); using SupportedChronoTypes = cudf::test::Concat; -TYPED_TEST_CASE(ParquetWriterChronoTypeTest, SupportedChronoTypes); +TYPED_TEST_SUITE(ParquetWriterChronoTypeTest, SupportedChronoTypes); +// TODO: debug truncation errors for `timestamp_ns` and overflow errors for `timestamp_s` , see +// issue #9393. 
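// The TYPED_TEST_CASE -> TYPED_TEST_SUITE renames throughout this patch follow GoogleTest
// itself, which renamed the macro and deprecated the old spelling. A self-contained sketch
// of a typed test suite; WidthTest and WidthTypes are illustrative names.
#include <gtest/gtest.h>
#include <cstdint>
#include <type_traits>

template <typename T>
struct WidthTest : public ::testing::Test {};

using WidthTypes = ::testing::Types<int8_t, int16_t, int32_t, int64_t>;

// Registers one instantiation of every TYPED_TEST below per element of WidthTypes.
TYPED_TEST_SUITE(WidthTest, WidthTypes);

TYPED_TEST(WidthTest, IsSignedIntegral)
{
  EXPECT_TRUE(std::is_signed<TypeParam>::value);
  EXPECT_TRUE(std::is_integral<TypeParam>::value);
}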
+using SupportedTimestampTypes = cudf::test::Types; +TYPED_TEST_SUITE(ParquetWriterTimestampTypeTest, SupportedTimestampTypes); // Base test fixture for chunked writer tests struct ParquetChunkedWriterTest : public cudf::test::BaseFixture { @@ -227,7 +237,7 @@ struct ParquetChunkedWriterNumericTypeTest : public ParquetChunkedWriterTest { }; // Declare typed test cases -TYPED_TEST_CASE(ParquetChunkedWriterNumericTypeTest, SupportedTypes); +TYPED_TEST_SUITE(ParquetChunkedWriterNumericTypeTest, SupportedTypes); namespace { // Generates a vector of uniform random values of type T @@ -363,6 +373,30 @@ TYPED_TEST(ParquetWriterChronoTypeTest, ChronosWithNulls) CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); } +TYPED_TEST(ParquetWriterTimestampTypeTest, TimestampOverflow) +{ + constexpr int64_t max = std::numeric_limits::max(); + auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return max - i; }); + auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; }); + + constexpr auto num_rows = 100; + column_wrapper col( + sequence, sequence + num_rows, validity); + table_view expected({col}); + + auto filepath = temp_env->get_temp_filepath("OrcTimestampOverflow.orc"); + cudf_io::parquet_writer_options out_opts = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected); + cudf_io::write_parquet(out_opts); + + cudf_io::parquet_reader_options in_opts = + cudf_io::parquet_reader_options::builder(cudf_io::source_info{filepath}) + .timestamp_type(this->type()); + auto result = cudf_io::read_parquet(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view()); +} + TEST_F(ParquetWriterTest, MultiColumn) { constexpr auto num_rows = 100; @@ -1049,12 +1083,21 @@ class custom_test_data_sink : public cudf::io::data_sink { void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { - char* ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value())); - stream.synchronize(); - outfile_.write(ptr, size); - CUDA_TRY(cudaFreeHost(ptr)); + this->device_write_async(gpu_data, size, stream).get(); + } + + std::future device_write_async(void const* gpu_data, + size_t size, + rmm::cuda_stream_view stream) override + { + return std::async(std::launch::deferred, [=] { + char* ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); + outfile_.write(ptr, size); + CUDA_TRY(cudaFreeHost(ptr)); + }); } void flush() override { outfile_.flush(); } @@ -2012,12 +2055,21 @@ class custom_test_memmap_sink : public cudf::io::data_sink { void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { - char* ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value())); - stream.synchronize(); - mm_writer->host_write(ptr, size); - CUDA_TRY(cudaFreeHost(ptr)); + this->device_write_async(gpu_data, size, stream).get(); + } + + std::future device_write_async(void const* gpu_data, + size_t size, + rmm::cuda_stream_view stream) override + { + return std::async(std::launch::deferred, [=] { + char* ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + CUDA_TRY(cudaMemcpyAsync(ptr, gpu_data, size, cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); + 
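// The custom sinks in this file implement the synchronous device_write() by calling
// device_write_async() and immediately waiting on the returned future; std::launch::deferred
// means the copy-and-write work only runs when the future is waited on. A minimal host-only
// sketch of that shape; write_async_deferred, src, and dst are illustrative, and the real
// sinks stage the data with cudaMemcpyAsync before writing.
#include <cstddef>
#include <cstring>
#include <future>

std::future<void> write_async_deferred(void const* src, char* dst, std::size_t size)
{
  return std::async(std::launch::deferred, [=] {
    std::memcpy(dst, src, size);  // stand-in for the device-to-host copy plus host write
  });
}

// The synchronous path simply blocks on the deferred work:
//   write_async_deferred(src, dst, size).get();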
mm_writer->host_write(ptr, size); + CUDA_TRY(cudaFreeHost(ptr)); + }); } void flush() override { mm_writer->flush(); } diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu index d5379b6dd30..fd2cae3d344 100644 --- a/cpp/tests/iterator/indexalator_test.cu +++ b/cpp/tests/iterator/indexalator_test.cu @@ -26,7 +26,7 @@ template struct IndexalatorTest : public IteratorTest { }; -TYPED_TEST_CASE(IndexalatorTest, TestingTypes); +TYPED_TEST_SUITE(IndexalatorTest, TestingTypes); TYPED_TEST(IndexalatorTest, input_iterator) { diff --git a/cpp/tests/iterator/optional_iterator_test_chrono.cu b/cpp/tests/iterator/optional_iterator_test_chrono.cu index c99814a3302..c7b73e2aa59 100644 --- a/cpp/tests/iterator/optional_iterator_test_chrono.cu +++ b/cpp/tests/iterator/optional_iterator_test_chrono.cu @@ -20,7 +20,7 @@ template struct ChronoOptionalIteratorTest : public IteratorTest { }; -TYPED_TEST_CASE(ChronoOptionalIteratorTest, TestingTypes); +TYPED_TEST_SUITE(ChronoOptionalIteratorTest, TestingTypes); TYPED_TEST(ChronoOptionalIteratorTest, nonull_optional_iterator) { nonull_optional_iterator(*this); diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu index 313fd1358f6..6d51f4a5c14 100644 --- a/cpp/tests/iterator/optional_iterator_test_numeric.cu +++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu @@ -20,7 +20,7 @@ template struct NumericOptionalIteratorTest : public IteratorTest { }; -TYPED_TEST_CASE(NumericOptionalIteratorTest, TestingTypes); +TYPED_TEST_SUITE(NumericOptionalIteratorTest, TestingTypes); TYPED_TEST(NumericOptionalIteratorTest, nonull_optional_iterator) { nonull_optional_iterator(*this); diff --git a/cpp/tests/iterator/pair_iterator_test_chrono.cu b/cpp/tests/iterator/pair_iterator_test_chrono.cu index fb9cb645ab8..996be7b4278 100644 --- a/cpp/tests/iterator/pair_iterator_test_chrono.cu +++ b/cpp/tests/iterator/pair_iterator_test_chrono.cu @@ -20,6 +20,6 @@ template struct ChronoPairIteratorTest : public IteratorTest { }; -TYPED_TEST_CASE(ChronoPairIteratorTest, TestingTypes); +TYPED_TEST_SUITE(ChronoPairIteratorTest, TestingTypes); TYPED_TEST(ChronoPairIteratorTest, nonull_pair_iterator) { nonull_pair_iterator(*this); } TYPED_TEST(ChronoPairIteratorTest, null_pair_iterator) { null_pair_iterator(*this); } diff --git a/cpp/tests/iterator/pair_iterator_test_numeric.cu b/cpp/tests/iterator/pair_iterator_test_numeric.cu index 21d163a6979..6c09997456d 100644 --- a/cpp/tests/iterator/pair_iterator_test_numeric.cu +++ b/cpp/tests/iterator/pair_iterator_test_numeric.cu @@ -20,7 +20,7 @@ template struct NumericPairIteratorTest : public IteratorTest { }; -TYPED_TEST_CASE(NumericPairIteratorTest, TestingTypes); +TYPED_TEST_SUITE(NumericPairIteratorTest, TestingTypes); TYPED_TEST(NumericPairIteratorTest, nonull_pair_iterator) { nonull_pair_iterator(*this); } TYPED_TEST(NumericPairIteratorTest, null_pair_iterator) { null_pair_iterator(*this); } diff --git a/cpp/tests/iterator/scalar_iterator_test.cu b/cpp/tests/iterator/scalar_iterator_test.cu index fb44423da27..3a394d30f97 100644 --- a/cpp/tests/iterator/scalar_iterator_test.cu +++ b/cpp/tests/iterator/scalar_iterator_test.cu @@ -14,9 +14,9 @@ */ #include -using TestingTypes = cudf::test::AllTypes; +using TestingTypes = cudf::test::FixedWidthTypesWithoutFixedPoint; -TYPED_TEST_CASE(IteratorTest, TestingTypes); +TYPED_TEST_SUITE(IteratorTest, TestingTypes); TYPED_TEST(IteratorTest, scalar_iterator) { diff --git 
a/cpp/tests/iterator/value_iterator_test_chrono.cu b/cpp/tests/iterator/value_iterator_test_chrono.cu index 5cdb1f88874..96ce2149f71 100644 --- a/cpp/tests/iterator/value_iterator_test_chrono.cu +++ b/cpp/tests/iterator/value_iterator_test_chrono.cu @@ -23,6 +23,6 @@ template struct ChronoValueIteratorTest : public IteratorTest { }; -TYPED_TEST_CASE(ChronoValueIteratorTest, TestingTypes); +TYPED_TEST_SUITE(ChronoValueIteratorTest, TestingTypes); TYPED_TEST(ChronoValueIteratorTest, non_null_iterator) { non_null_iterator(*this); } TYPED_TEST(ChronoValueIteratorTest, null_iterator) { null_iterator(*this); } diff --git a/cpp/tests/iterator/value_iterator_test_numeric.cu b/cpp/tests/iterator/value_iterator_test_numeric.cu index f24dae995b3..5decb437b8f 100644 --- a/cpp/tests/iterator/value_iterator_test_numeric.cu +++ b/cpp/tests/iterator/value_iterator_test_numeric.cu @@ -23,6 +23,6 @@ template struct NumericValueIteratorTest : public IteratorTest { }; -TYPED_TEST_CASE(NumericValueIteratorTest, TestingTypes); +TYPED_TEST_SUITE(NumericValueIteratorTest, TestingTypes); TYPED_TEST(NumericValueIteratorTest, non_null_iterator) { non_null_iterator(*this); } TYPED_TEST(NumericValueIteratorTest, null_iterator) { null_iterator(*this); } diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu index f6d76f9ea70..9f9547b06cf 100644 --- a/cpp/tests/join/conditional_join_tests.cu +++ b/cpp/tests/join/conditional_join_tests.cu @@ -62,21 +62,29 @@ auto left_zero_eq_right_zero = // Generate a single pair of left/right non-nullable columns of random data // suitable for testing a join against a reference join implementation. template -std::pair, std::vector> gen_random_repeated_columns(unsigned int N = 10000, - unsigned int num_repeats = 10) +std::pair, std::vector> gen_random_repeated_columns( + unsigned int N_left = 10000, + unsigned int num_repeats_left = 10, + unsigned int N_right = 10000, + unsigned int num_repeats_right = 10) { // Generate columns of num_repeats repeats of the integer range [0, num_unique), // then merge a shuffled version and compare to hash join. 
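// As the comment above says, the key columns are built by repeating the range
// [0, num_unique) and then shuffling, so each key value occurs a known number of times on
// each side of the join. A standalone sketch of that construction; repeated_shuffled_keys
// is an illustrative name.
#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

std::vector<int> repeated_shuffled_keys(unsigned int n, unsigned int num_repeats)
{
  unsigned int const num_unique = n / num_repeats;
  std::vector<int> keys(n);
  for (unsigned int i = 0; i < num_repeats; ++i) {
    // Each pass writes 0, 1, ..., num_unique - 1 into the next block of the vector.
    std::iota(keys.begin() + num_unique * i, keys.begin() + num_unique * (i + 1), 0);
  }
  std::mt19937 gen{std::random_device{}()};
  std::shuffle(keys.begin(), keys.end(), gen);
  return keys;
}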
- unsigned int num_unique = N / num_repeats; + unsigned int num_unique_left = N_left / num_repeats_left; + unsigned int num_unique_right = N_right / num_repeats_right; - std::vector left(N); - std::vector right(N); + std::vector left(N_left); + std::vector right(N_right); - for (unsigned int i = 0; i < num_repeats; ++i) { - std::iota( - std::next(left.begin(), num_unique * i), std::next(left.begin(), num_unique * (i + 1)), 0); - std::iota( - std::next(right.begin(), num_unique * i), std::next(right.begin(), num_unique * (i + 1)), 0); + for (unsigned int i = 0; i < num_repeats_left; ++i) { + std::iota(std::next(left.begin(), num_unique_left * i), + std::next(left.begin(), num_unique_left * (i + 1)), + 0); + } + for (unsigned int i = 0; i < num_repeats_right; ++i) { + std::iota(std::next(right.begin(), num_unique_right * i), + std::next(right.begin(), num_unique_right * (i + 1)), + 0); } std::random_device rd; @@ -360,7 +368,7 @@ struct ConditionalInnerJoinTest : public ConditionalJoinPairReturnTest { } }; -TYPED_TEST_CASE(ConditionalInnerJoinTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST_SUITE(ConditionalInnerJoinTest, cudf::test::IntegralTypesNotBool); TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnOneRowAllEqual) { @@ -495,6 +503,18 @@ TYPED_TEST(ConditionalInnerJoinTest, TestCompareRandomToHashNulls) this->compare_to_hash_join_nulls({left}, {right}); }; +TYPED_TEST(ConditionalInnerJoinTest, TestCompareRandomToHashNullsLargerLeft) +{ + auto [left, right] = gen_random_repeated_columns(2000, 10, 1000, 10); + this->compare_to_hash_join({left}, {right}); +}; + +TYPED_TEST(ConditionalInnerJoinTest, TestCompareRandomToHashNullsLargerRight) +{ + auto [left, right] = gen_random_repeated_columns(1000, 10, 2000, 10); + this->compare_to_hash_join({left}, {right}); +}; + TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoNullsRowAllEqual) { this->test_nulls( @@ -534,7 +554,7 @@ struct ConditionalLeftJoinTest : public ConditionalJoinPairReturnTest { } }; -TYPED_TEST_CASE(ConditionalLeftJoinTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST_SUITE(ConditionalLeftJoinTest, cudf::test::IntegralTypesNotBool); TYPED_TEST(ConditionalLeftJoinTest, TestTwoColumnThreeRowSomeEqual) { @@ -592,7 +612,7 @@ struct ConditionalFullJoinTest : public ConditionalJoinPairReturnTest { } }; -TYPED_TEST_CASE(ConditionalFullJoinTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST_SUITE(ConditionalFullJoinTest, cudf::test::IntegralTypesNotBool); TYPED_TEST(ConditionalFullJoinTest, TestOneColumnNoneEqual) { @@ -769,7 +789,7 @@ struct ConditionalLeftSemiJoinTest : public ConditionalJoinSingleReturnTest { } }; -TYPED_TEST_CASE(ConditionalLeftSemiJoinTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST_SUITE(ConditionalLeftSemiJoinTest, cudf::test::IntegralTypesNotBool); TYPED_TEST(ConditionalLeftSemiJoinTest, TestTwoColumnThreeRowSomeEqual) { @@ -816,7 +836,7 @@ struct ConditionalLeftAntiJoinTest : public ConditionalJoinSingleReturnTest { } }; -TYPED_TEST_CASE(ConditionalLeftAntiJoinTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST_SUITE(ConditionalLeftAntiJoinTest, cudf::test::IntegralTypesNotBool); TYPED_TEST(ConditionalLeftAntiJoinTest, TestTwoColumnThreeRowSomeEqual) { diff --git a/cpp/tests/join/cross_join_tests.cpp b/cpp/tests/join/cross_join_tests.cpp index 1c83e7208c1..75868dea972 100644 --- a/cpp/tests/join/cross_join_tests.cpp +++ b/cpp/tests/join/cross_join_tests.cpp @@ -33,7 +33,7 @@ template class CrossJoinTypeTests : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(CrossJoinTypeTests, 
cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(CrossJoinTypeTests, cudf::test::FixedWidthTypes); TYPED_TEST(CrossJoinTypeTests, CrossJoin) { diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 8945f82baef..e6ae709f009 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -30,6 +32,7 @@ #include #include #include +#include #include #include @@ -1415,6 +1418,19 @@ TEST_F(JoinTest, HashJoinWithStructsAndNulls) } } +TEST_F(JoinTest, HashJoinLargeOutputSize) +{ + // self-join a table of zeroes to generate an output row count that would overflow int32_t + std::size_t col_size = 65567; + rmm::device_buffer zeroes(col_size * sizeof(int32_t), rmm::cuda_stream_default); + CUDA_TRY(cudaMemsetAsync(zeroes.data(), 0, zeroes.size(), rmm::cuda_stream_default.value())); + cudf::column_view col_zeros(cudf::data_type{cudf::type_id::INT32}, col_size, zeroes.data()); + cudf::table_view tview{{col_zeros}}; + cudf::hash_join hash_join(tview, cudf::null_equality::UNEQUAL); + std::size_t output_size = hash_join.inner_join_size(tview); + EXPECT_EQ(col_size * col_size, output_size); +} + struct JoinDictionaryTest : public cudf::test::BaseFixture { }; @@ -1434,21 +1450,25 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls) auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - { - auto result = cudf::left_join(t0, t1, {0}, {0}); - auto result_view = result->view(); - auto decoded1 = cudf::dictionary::decode(result_view.column(1)); - auto decoded4 = cudf::dictionary::decode(result_view.column(4)); - std::vector result_decoded({result_view.column(0), - decoded1->view(), - result_view.column(2), - result_view.column(3), - decoded4->view(), - result_view.column(5)}); - - auto gold = cudf::left_join(g0, g1, {0}, {0}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); - } + + auto result = cudf::left_join(t0, t1, {0}, {0}); + auto result_view = result->view(); + auto decoded1 = cudf::dictionary::decode(result_view.column(1)); + auto decoded4 = cudf::dictionary::decode(result_view.column(4)); + std::vector result_decoded({result_view.column(0), + decoded1->view(), + result_view.column(2), + result_view.column(3), + decoded4->view(), + result_view.column(5)}); + auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); + auto sorted_result = cudf::gather(cudf::table_view(result_decoded), *result_sort_order); + + auto gold = cudf::left_join(g0, g1, {0}, {0}); + auto gold_sort_order = cudf::sorted_order(gold->view()); + auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinDictionaryTest, LeftJoinWithNulls) @@ -1476,11 +1496,16 @@ TEST_F(JoinDictionaryTest, LeftJoinWithNulls) result_view.column(3), result_view.column(4), decoded5->view()}); + auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); + auto sorted_result = cudf::gather(cudf::table_view(result_decoded), *result_sort_order); - auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); - auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); - auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); + auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); + auto g1 = 
cudf::table_view({col1_0, col1_1, col1_2_w}); + auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}); + auto gold_sort_order = cudf::sorted_order(gold->view()); + auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinDictionaryTest, InnerJoinNoNulls) @@ -1508,11 +1533,16 @@ TEST_F(JoinDictionaryTest, InnerJoinNoNulls) result_view.column(3), decoded4->view(), result_view.column(5)}); + auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); + auto sorted_result = cudf::gather(cudf::table_view(result_decoded), *result_sort_order); - auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); - auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); + auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); + auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); + auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); + auto gold_sort_order = cudf::sorted_order(gold->view()); + auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinDictionaryTest, InnerJoinWithNulls) @@ -1540,11 +1570,16 @@ TEST_F(JoinDictionaryTest, InnerJoinWithNulls) result_view.column(3), result_view.column(4), decoded5->view()}); + auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); + auto sorted_result = cudf::gather(cudf::table_view(result_decoded), *result_sort_order); - auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); - auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); - auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); + auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); + auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); + auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); + auto gold_sort_order = cudf::sorted_order(gold->view()); + auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinDictionaryTest, FullJoinNoNulls) @@ -1572,11 +1607,16 @@ TEST_F(JoinDictionaryTest, FullJoinNoNulls) result_view.column(3), decoded4->view(), result_view.column(5)}); + auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); + auto sorted_result = cudf::gather(cudf::table_view(result_decoded), *result_sort_order); - auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); - auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); + auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); + auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); + auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); + auto gold_sort_order = cudf::sorted_order(gold->view()); + auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinDictionaryTest, FullJoinWithNulls) @@ -1604,11 +1644,16 @@ TEST_F(JoinDictionaryTest, FullJoinWithNulls) decoded3->view(), result_view.column(4), result_view.column(5)}); + auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); + auto sorted_result = cudf::gather(cudf::table_view(result_decoded), 
*result_sort_order); - auto g0 = cudf::table_view({col0_0_w, col0_1, col0_2}); - auto g1 = cudf::table_view({col1_0_w, col1_1, col1_2}); - auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); + auto g0 = cudf::table_view({col0_0_w, col0_1, col0_2}); + auto g1 = cudf::table_view({col1_0_w, col1_1, col1_2}); + auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); + auto gold_sort_order = cudf::sorted_order(gold->view()); + auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } TEST_F(JoinTest, FullJoinWithStructsAndNulls) @@ -1735,4 +1780,58 @@ TEST_F(JoinTest, FullJoinWithStructsAndNulls) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } +TEST_F(JoinTest, Repro_StructsWithoutNullsPushedDown) +{ + // When joining on a STRUCT column, if the parent nulls are not reflected in + // the children, the join might produce incorrect results. + // + // In this test, a fact table of structs is joined against a dimension table. + // Both tables must match (only) on the NULL row. This will fail if the fact table's + // nulls are not pushed down into its children. + using ints = column_wrapper; + using structs = cudf::test::structs_column_wrapper; + using namespace cudf::test::iterators; + + auto make_table = [](auto&& col) { + auto columns = CVector{}; + columns.push_back(std::move(col)); + return cudf::table{std::move(columns)}; + }; + + auto const fact_table = [make_table] { + auto fact_ints = ints{0, 1, 2, 3, 4}; + auto fact_structs = structs{{fact_ints}, no_nulls()}.release(); + // Now set struct validity to invalidate index#3. + cudf::detail::set_null_mask(fact_structs->mutable_view().null_mask(), 3, 4, false); + // Struct row#3 is null, but Struct.child has a non-null value. + return make_table(std::move(fact_structs)); + }(); + + auto const dimension_table = [make_table] { + auto dim_ints = ints{999}; + auto dim_structs = structs{{dim_ints}, null_at(0)}; + return make_table(dim_structs.release()); + }(); + + auto const result = cudf::inner_join(fact_table.view(), dimension_table.view(), {0}, {0}); + EXPECT_EQ(result->num_rows(), 1); // The null STRUCT rows should match. + + // Note: Join result might not have nulls pushed down, since it's an output of gather(). + // Must superimpose parent nulls before comparisons. + auto [superimposed_results, _] = cudf::structs::detail::superimpose_parent_nulls(*result); + + auto const expected = [] { + auto fact_ints = ints{0}; + auto fact_structs = structs{{fact_ints}, null_at(0)}; + auto dim_ints = ints{0}; + auto dim_structs = structs{{dim_ints}, null_at(0)}; + auto columns = CVector{}; + columns.push_back(fact_structs.release()); + columns.push_back(dim_structs.release()); + return cudf::table{std::move(columns)}; + }(); + + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(superimposed_results, expected); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/labeling/label_bins_tests.cpp b/cpp/tests/labeling/label_bins_tests.cpp index 34c8ff7251f..c884a4ced59 100644 --- a/cpp/tests/labeling/label_bins_tests.cpp +++ b/cpp/tests/labeling/label_bins_tests.cpp @@ -147,7 +147,7 @@ template struct ExceptionCasesBinTestFixture : public GenericExceptionCasesBinTestFixture { }; -TYPED_TEST_CASE(ExceptionCasesBinTestFixture, NumericTypesNotBool); +TYPED_TEST_SUITE(ExceptionCasesBinTestFixture, NumericTypesNotBool); // Empty input must return an empty output. 
TYPED_TEST(ExceptionCasesBinTestFixture, TestEmptyInput) @@ -179,7 +179,7 @@ template struct NaNBinTestFixture : public GenericExceptionCasesBinTestFixture { }; -TYPED_TEST_CASE(NaNBinTestFixture, FloatingPointTypes); +TYPED_TEST_SUITE(NaNBinTestFixture, FloatingPointTypes); TYPED_TEST(NaNBinTestFixture, TestNaN) { @@ -208,7 +208,7 @@ struct BoundaryExclusionBinTestFixture : public BinTestFixture { } }; -TYPED_TEST_CASE(BoundaryExclusionBinTestFixture, NumericTypesNotBool); +TYPED_TEST_SUITE(BoundaryExclusionBinTestFixture, NumericTypesNotBool); // Boundary points when both bounds are excluded should be labeled null. TYPED_TEST(BoundaryExclusionBinTestFixture, TestNoIncludes) @@ -301,7 +301,7 @@ struct RealDataBinTestFixture : public BinTestFixture { } }; -TYPED_TEST_CASE(RealDataBinTestFixture, NumericTypesNotBool); +TYPED_TEST_SUITE(RealDataBinTestFixture, NumericTypesNotBool); TYPED_TEST(RealDataBinTestFixture, TestRealData256) { this->test(256); }; TYPED_TEST(RealDataBinTestFixture, TestRealData512) { this->test(512); }; @@ -317,7 +317,7 @@ struct NegativeNumbersBinTestFixture : public RealDataBinTestFixture { } }; -TYPED_TEST_CASE(NegativeNumbersBinTestFixture, SignedNumericTypesNotBool); +TYPED_TEST_SUITE(NegativeNumbersBinTestFixture, SignedNumericTypesNotBool); TYPED_TEST(NegativeNumbersBinTestFixture, TestNegativeNumbers256) { this->test(256); }; TYPED_TEST(NegativeNumbersBinTestFixture, TestNegativeNumbers512) { this->test(512); }; @@ -331,7 +331,7 @@ template struct FixedPointBinTestFixture : public BinTestFixture { }; -TYPED_TEST_CASE(FixedPointBinTestFixture, FixedPointTypes); +TYPED_TEST_SUITE(FixedPointBinTestFixture, FixedPointTypes); TYPED_TEST(FixedPointBinTestFixture, TestFixedPointData) { diff --git a/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp b/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp index 0d988da34ea..ca25560141c 100644 --- a/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp +++ b/cpp/tests/lists/combine/concatenate_list_elements_tests.cpp @@ -64,7 +64,7 @@ struct ConcatenateListElementsTypedTest : public cudf::test::BaseFixture { using TypesForTest = cudf::test::Concat; -TYPED_TEST_CASE(ConcatenateListElementsTypedTest, TypesForTest); +TYPED_TEST_SUITE(ConcatenateListElementsTypedTest, TypesForTest); TYPED_TEST(ConcatenateListElementsTypedTest, SimpleInputNoNull) { diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp index db145aa4e42..5d7e218898c 100644 --- a/cpp/tests/lists/contains_tests.cpp +++ b/cpp/tests/lists/contains_tests.cpp @@ -39,7 +39,7 @@ template struct TypedContainsTest : public ContainsTest { }; -TYPED_TEST_CASE(TypedContainsTest, ContainsTestTypes); +TYPED_TEST_SUITE(TypedContainsTest, ContainsTestTypes); namespace { template (), void>* = nullptr> @@ -370,7 +370,7 @@ struct TypedVectorContainsTest : public ContainsTest { using VectorContainsTestTypes = cudf::test::Concat; -TYPED_TEST_CASE(TypedVectorContainsTest, VectorContainsTestTypes); +TYPED_TEST_SUITE(TypedVectorContainsTest, VectorContainsTestTypes); TYPED_TEST(TypedVectorContainsTest, ListContainsVectorWithNoNulls) { @@ -603,7 +603,7 @@ template struct TypedContainsNaNsTest : public ContainsTest { }; -TYPED_TEST_CASE(TypedContainsNaNsTest, FloatingPointTypes); +TYPED_TEST_SUITE(TypedContainsNaNsTest, FloatingPointTypes); template T get_nan(const char* nan_contents) @@ -713,7 +713,7 @@ template struct TypedContainsDecimalsTest : public ContainsTest { }; -TYPED_TEST_CASE(TypedContainsDecimalsTest, FixedPointTypes); 
+TYPED_TEST_SUITE(TypedContainsDecimalsTest, FixedPointTypes); TYPED_TEST(TypedContainsDecimalsTest, ListContainsScalar) { diff --git a/cpp/tests/lists/count_elements_tests.cpp b/cpp/tests/lists/count_elements_tests.cpp index c5cb9d230c3..28d31d27ae5 100644 --- a/cpp/tests/lists/count_elements_tests.cpp +++ b/cpp/tests/lists/count_elements_tests.cpp @@ -32,7 +32,7 @@ template class ListsElementsNumericsTest : public ListsElementsTest { }; -TYPED_TEST_CASE(ListsElementsNumericsTest, NumericTypesNotBool); +TYPED_TEST_SUITE(ListsElementsNumericsTest, NumericTypesNotBool); TYPED_TEST(ListsElementsNumericsTest, CountElements) { diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp index 270e01075b9..8efcf8886ae 100644 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ b/cpp/tests/lists/drop_list_duplicates_tests.cpp @@ -29,6 +29,7 @@ using namespace cudf::test::iterators; using float_type = float; +using IntListsCol = cudf::test::lists_column_wrapper; using FloatListsCol = cudf::test::lists_column_wrapper; using StrListsCol = cudf::test::lists_column_wrapper; using StringsCol = cudf::test::strings_column_wrapper; @@ -48,27 +49,99 @@ struct DropListDuplicatesTest : public cudf::test::BaseFixture { TEST_F(DropListDuplicatesTest, FloatingPointTestsWithSignedZero) { // -0.0 and 0.0 should be considered equal. - auto const lists = FloatListsCol{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0}; - auto const expected = FloatListsCol{0, 1, 2}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + auto const keys = FloatListsCol{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0, 3}; + auto const vals = + StrListsCol{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"}; + auto const expected_keys = FloatListsCol{0, 1, 2, 3}; + + // Remove duplicates only from keys. + { + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_keys, verbosity); + } + + // Remove duplicates with KEEP_FIRST. + { + auto const expected_vals = StrListsCol{"1", "2", "3", "14"}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_FIRST); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); + } + + // Remove duplicates with KEEP_LAST. + { + auto const expected_vals = StrListsCol{"13", "8", "9", "14"}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); + } + + // Remove duplicates with KEEP_NONE. 
+ { + auto const expected_keys = FloatListsCol{3}; + auto const expected_vals = StrListsCol{"14"}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_NONE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); + } } TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) { - // Lists contain inf. + auto const keys = FloatListsCol{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}; + auto const vals = IntListsCol{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; + auto const expected_keys = FloatListsCol{neg_Inf, 0, Inf}; + + // Remove duplicates only from keys. { - auto const lists = FloatListsCol{0, 1, 2, 0, 1, 2, 0, 1, 2, Inf, Inf, Inf}; - auto const expected = FloatListsCol{0, 1, 2, Inf}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_keys, verbosity); } + + // Remove duplicates with KEEP_FIRST. { - auto const lists = FloatListsCol{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}; - auto const expected = FloatListsCol{neg_Inf, 0, Inf}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + auto const expected_vals = IntListsCol{3, 2, 1}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_FIRST); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); + } + + // Remove duplicates with KEEP_LAST. + { + auto const expected_vals = IntListsCol{11, 10, 9}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); } + + // Remove duplicates with KEEP_NONE. + { + auto const expected_keys = FloatListsCol{FloatListsCol{}}; + auto const expected_vals = IntListsCol{IntListsCol{}}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_NONE); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); + } + // exit(0); } // The position of NaN is undefined after sorting, thus we need to offload the data to CPU to @@ -235,10 +308,15 @@ TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) // Empty input. 
{ - auto const lists = ListsCol{{}}; - auto const expected = ListsCol{{}}; + auto const lists = ListsCol{}; + auto const expected = ListsCol{}; auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + + auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); } // Trivial cases. @@ -247,6 +325,11 @@ TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) auto const expected = ListsCol{0, 1, 2, 3, 4, 5}; auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + + auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); } // Multiple empty lists. @@ -255,6 +338,11 @@ TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) auto const expected = ListsCol{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}; auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + + auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( + cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); } } @@ -264,11 +352,44 @@ TYPED_TEST(DropListDuplicatesTypedTest, NonNullInputTests) // Adjacent lists containing the same entries. { - auto const lists = + auto const keys = ListsCol{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}; - auto const expected = ListsCol{{1}, {1, 2}, {2, 3}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + auto const vals = + ListsCol{{1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 3, 4, 5, 6, 7, 8}}; + auto const expected_keys = ListsCol{{1}, {1, 2}, {2, 3}}; + + // Remove duplicates with KEEP_FIRST. + { + auto const expected_vals = ListsCol{{1}, {1, 6}, {1, 5}}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_FIRST); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); + } + + // Remove duplicates with KEEP_LAST. + { + auto const expected_vals = ListsCol{{8}, {5, 8}, {4, 8}}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); + } + + // Remove duplicates with KEEP_NONE. 
+ { + auto const expected = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_NONE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); + } } // Sliced list column. @@ -315,26 +436,99 @@ TYPED_TEST(DropListDuplicatesTypedTest, WithNullInputTests) using ListsCol = cudf::test::lists_column_wrapper; auto constexpr null = TypeParam{0}; - // null lists. + // null entries and lists. { - auto const lists = ListsCol{ - {{3, 2, 1, 4, 1}, {5}, {} /*NULL*/, {} /*NULL*/, {10, 8, 9}, {6, 7}}, nulls_at({2, 3})}; - auto const expected = + auto const keys = ListsCol{{{3, 2, 1, 4, 1}, {5}, {} /*NULL*/, {} /*NULL*/, {10, 8, 9}, {6, 7}}, + nulls_at({2, 3})}; + auto const vals = + ListsCol{{ListsCol{{1, 2, null, 4, 5}, null_at(2)}, {1}, {}, {} /*NULL*/, {1, 2, 3}, {1, 2}}, + null_at(3)}; + auto const expected_keys = ListsCol{{{1, 2, 3, 4}, {5}, {} /*NULL*/, {} /*NULL*/, {8, 9, 10}, {6, 7}}, nulls_at({2, 3})}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + + // Remove duplicates with KEEP_FIRST. + { + auto const expected_vals = + ListsCol{{ListsCol{{null, 2, 1, 4}, null_at(0)}, {1}, {}, {} /*NULL*/, {2, 3, 1}, {1, 2}}, + null_at(3)}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_FIRST); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); + } + + // Remove duplicates with KEEP_LAST. + { + auto const expected_vals = + ListsCol{{ListsCol{5, 2, 1, 4}, {1}, {}, {} /*NULL*/, {2, 3, 1}, {1, 2}}, null_at(3)}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results_vals->view(), expected_vals, verbosity); + } + + // Remove duplicates with KEEP_NONE. + { + auto const expected_keys = + ListsCol{{{2, 3, 4}, {5}, {} /*NULL*/, {} /*NULL*/, {8, 9, 10}, {6, 7}}, nulls_at({2, 3})}; + auto const expected_vals = + ListsCol{{ListsCol{2, 1, 4}, {1}, {}, {} /*NULL*/, {2, 3, 1}, {1, 2}}, null_at(3)}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_NONE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results_vals->view(), expected_vals, verbosity); + } } // null entries are equal. 
{ - auto const lists = ListsCol{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; - auto const expected = - ListsCol{std::initializer_list{1, 3, 5, 7, 9, null}, null_at(5)}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); + auto const keys = + ListsCol{{null, 1, null, 3, null, 5, null, 7, null, 9}, nulls_at({0, 2, 4, 6, 8})}; + auto const vals = ListsCol{{null, 1, 2, 3, 4, null, 6, 7, 8, null}, nulls_at({0, 5, 9})}; + auto const expected_keys = ListsCol{{1, 3, 5, 7, 9, null}, null_at(5)}; + + // Remove duplicates with KEEP_FIRST. + { + auto const expected_vals = ListsCol{{1, 3, null, 7, null, null}, nulls_at({2, 4, 5})}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_FIRST); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); + } + + // Remove duplicates with KEEP_LAST. + { + auto const expected_vals = ListsCol{{1, 3, null, 7, null, 8}, nulls_at({2, 4})}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_LAST); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); + } + + // Remove duplicates with KEEP_NONE. + { + auto const expected_keys = ListsCol{1, 3, 5, 7, 9}; + auto const expected_vals = ListsCol{{1, 3, null, 7, null}, nulls_at({2, 4})}; + auto const [results_keys, results_vals] = + cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, + cudf::lists_column_view{vals}, + cudf::duplicate_keep_option::KEEP_NONE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results_keys->view(), expected_keys, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); + } } - // nulls entries are not equal. + // null entries are not equal. { auto const lists = ListsCol{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; auto const expected = @@ -558,7 +752,7 @@ TEST_F(DropListDuplicatesTest, SlicedInputListsOfStructsWithNaNs) cudf::make_lists_column(2, IntsCol{0, 10, 18}.release(), get_structs().release(), 0, {}); auto const lists2 = cudf::slice(lists_original->view(), {1, 2})[0]; // test on the second list - // Contain expected values excluding NaN. + // Contain expected vals excluding NaN. auto const results_children_expected = std::unordered_set{0, 1, 2}; // Test for cudf::nan_equality::UNEQUAL. 
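For reviewers skimming the patch, here is a minimal standalone sketch (not part of the patch itself) of the new keys/values overload of cudf::lists::drop_list_duplicates that the tests above exercise. The toy data, variable names, and header paths are assumptions; only the call shape and the duplicate_keep_option values are taken from the changes above.

// Sketch only, under the assumptions stated above; not part of the patch.
#include <cudf/lists/drop_list_duplicates.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void drop_list_duplicates_sketch()
{
  using IntLists = cudf::test::lists_column_wrapper<int32_t>;

  // One list row of keys and a parallel list row of values.
  auto const keys = IntLists{{1, 1, 2, 2, 3}};
  auto const vals = IntLists{{10, 11, 20, 21, 30}};

  // De-duplicate the keys and keep the value paired with the first
  // occurrence of each key; KEEP_LAST and KEEP_NONE behave as shown in
  // the tests above.
  auto const [unique_keys, kept_vals] =
    cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys},
                                      cudf::lists_column_view{vals},
                                      cudf::duplicate_keep_option::KEEP_FIRST);
  // unique_keys -> {{1, 2, 3}}, kept_vals -> {{10, 20, 30}}
}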
diff --git a/cpp/tests/lists/explode_tests.cpp b/cpp/tests/lists/explode_tests.cpp index 1685f2793ce..fd22932916f 100644 --- a/cpp/tests/lists/explode_tests.cpp +++ b/cpp/tests/lists/explode_tests.cpp @@ -40,9 +40,9 @@ template class ExplodeOuterTypedTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ExplodeTypedTest, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(ExplodeTypedTest, cudf::test::FixedPointTypes); -TYPED_TEST_CASE(ExplodeOuterTypedTest, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(ExplodeOuterTypedTest, cudf::test::FixedPointTypes); TEST_F(ExplodeTest, Empty) { diff --git a/cpp/tests/lists/extract_tests.cpp b/cpp/tests/lists/extract_tests.cpp index cf4ccd8ede4..d6ee62a7731 100644 --- a/cpp/tests/lists/extract_tests.cpp +++ b/cpp/tests/lists/extract_tests.cpp @@ -15,15 +15,20 @@ */ #include +#include #include #include #include #include #include +#include #include +#include + #include + #include struct ListsExtractTest : public cudf::test::BaseFixture { @@ -36,7 +41,7 @@ template class ListsExtractNumericsTest : public ListsExtractTest { }; -TYPED_TEST_CASE(ListsExtractNumericsTest, NumericTypesNotBool); +TYPED_TEST_SUITE(ListsExtractNumericsTest, NumericTypesNotBool); TYPED_TEST(ListsExtractNumericsTest, ExtractElement) { @@ -209,6 +214,34 @@ TYPED_TEST(ListsExtractNumericsTest, ExtractElementNestedLists) } } +TYPED_TEST(ListsExtractNumericsTest, ExtractElementsFromNonCompactedNullLists) +{ + using namespace cudf::test::iterators; + using indices = cudf::test::fixed_width_column_wrapper; + using lcw = cudf::test::lists_column_wrapper; + using result_column = cudf::test::fixed_width_column_wrapper; + auto constexpr X = -1; // Value indicating null. + + auto input = + lcw{{{1, 2, 3}, {4, 5, 6}, {}, {7, 8, 9}, {0, 1, 2}, {}, {3, 4, 5}}, nulls_at({2, 5})} + .release(); + + // Set null at index 4. 
+ cudf::detail::set_null_mask(input->mutable_view().null_mask(), 4, 5, false); + + { + auto result = cudf::lists::extract_list_element(cudf::lists_column_view{*input}, 0); + auto expected = result_column{{1, 4, X, 7, X, X, 3}, nulls_at({2, 4, 5})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + auto index = indices{0, 1, 2, 0, 1, 2, 0}; + auto result = cudf::lists::extract_list_element(cudf::lists_column_view{*input}, index); + auto expected = result_column{{1, 5, X, 7, X, X, 3}, nulls_at({2, 4, 5})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + TEST_F(ListsExtractTest, ExtractElementEmpty) { using LCW = cudf::test::lists_column_wrapper; @@ -221,12 +254,12 @@ TEST_F(ListsExtractTest, ExtractElementEmpty) LCW empty_strings({LCW{"", "", ""}}); result = cudf::lists::extract_list_element(cudf::lists_column_view(empty_strings), 1); cudf::test::strings_column_wrapper expected({""}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); LCW null_strings({LCW{"", "", ""}}, thrust::make_constant_iterator(0)); result = cudf::lists::extract_list_element(cudf::lists_column_view(null_strings), 1); cudf::test::strings_column_wrapper expected_null({""}, {0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_null, *result); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_null, *result); } TEST_F(ListsExtractTest, ExtractElementWithNulls) @@ -240,16 +273,187 @@ TEST_F(ListsExtractTest, ExtractElementWithNulls) { auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 0); cudf::test::strings_column_wrapper expected({"Héllo", "are", "some", "tést"}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); } { auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), 1); cudf::test::strings_column_wrapper expected({"", "", "", "strings"}, {0, 0, 0, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); } { auto result = cudf::lists::extract_list_element(cudf::lists_column_view(input), -1); cudf::test::strings_column_wrapper expected({"thesé", "are", "", "strings"}, {1, 1, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *result); + } +} + +struct ListsExtractColumnIndicesTest : ListsExtractTest { +}; + +template +struct ListsExtractColumnIndicesTypedTest : ListsExtractColumnIndicesTest { +}; + +TYPED_TEST_SUITE(ListsExtractColumnIndicesTypedTest, cudf::test::FixedWidthTypes); + +TYPED_TEST(ListsExtractColumnIndicesTypedTest, ExtractElement) +{ + using namespace cudf; + using namespace cudf::lists; + using namespace cudf::test; + using namespace cudf::test::iterators; + using LCW = lists_column_wrapper; + using FWCW = fixed_width_column_wrapper; + using indices = fixed_width_column_wrapper; + + auto input_column = + LCW({LCW{3, 2, 1}, LCW{}, LCW{30, 20, 10, 50}, LCW{100, 120}, LCW{0}, LCW{}}, null_at(1)); + auto input = lists_column_view(input_column); + + { + // Test fetching first element. + auto result = extract_list_element(input, indices{0, 0, 0, 0, 0, 0}); + auto expected = FWCW({3, 0, 30, 100, 0, 0}, nulls_at({1, 5})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test fetching last element. + auto result = extract_list_element(input, indices{2, 0, 3, 1, 0, 0}); + auto expected = FWCW({1, 0, 50, 120, 0, 0}, nulls_at({1, 5})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test fetching *all* out of bounds. 
+ auto result = extract_list_element(input, indices{9, 9, 9, 9, 9, 9}); + auto expected = FWCW({0, 0, 0, 0, 0, 0}, all_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test fetching first from the end. + auto result = extract_list_element(input, indices{-1, -1, -1, -1, -1, -1}); + auto expected = FWCW({1, 0, 50, 120, 0, 0}, nulls_at({1, 5})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test fetching last from the end. + auto result = extract_list_element(input, indices{-3, 0, -4, -2, -1, 0}); + auto expected = FWCW({3, 0, 30, 100, 0, 0}, nulls_at({1, 5})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test fetching *all* negative out of bounds. + auto result = extract_list_element(input, indices{-9, -9, -9, -9, -9, -9}); + auto expected = FWCW({0, 0, 0, 0, 0, 0}, all_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test mixed indices. + auto result = extract_list_element(input, indices{-2, 0, 3, -1, 0, 0}); + auto expected = FWCW({2, 0, 50, 120, 0, 0}, nulls_at({1, 5})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test possibly null indices. + auto result = extract_list_element(input, indices{{-2, 0, 3, -1, 0, 0}, nulls_at({2, 4})}); + auto expected = FWCW({2, 0, 50, 120, 0, 0}, nulls_at({1, 2, 4, 5})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +TYPED_TEST(ListsExtractColumnIndicesTypedTest, FailureCases) +{ + using namespace cudf; + using namespace cudf::lists; + using namespace cudf::test; + using namespace cudf::test::iterators; + using LCW = lists_column_wrapper; + using indices = fixed_width_column_wrapper; + + { + // Non-empty input, with mismatched size of indices. + auto input_column = + LCW({LCW{3, 2, 1}, LCW{}, LCW{30, 20, 10, 50}, LCW{100, 120}, LCW{0}, LCW{}}, null_at(1)); + auto input = lists_column_view(input_column); + + EXPECT_THROW(extract_list_element(input, indices{0, 1, 2}), cudf::logic_error); + } + { + // Non-empty input, with empty indices. + auto input_column = + LCW({LCW{3, 2, 1}, LCW{}, LCW{30, 20, 10, 50}, LCW{100, 120}, LCW{0}, LCW{}}, null_at(1)); + auto input = lists_column_view(input_column); + + EXPECT_THROW(extract_list_element(input, indices{}), cudf::logic_error); + } + { + // Empty input, with mismatched size of indices. + auto input_column = LCW{}; + auto input = lists_column_view(input_column); + EXPECT_THROW(extract_list_element(input, indices{0, 1, 2}), cudf::logic_error); + } +} + +TEST_F(ListsExtractColumnIndicesTest, ExtractStrings) +{ + using namespace cudf; + using namespace cudf::lists; + using namespace cudf::test; + using namespace cudf::test::iterators; + using LCW = lists_column_wrapper; + using strings = strings_column_wrapper; + using indices = fixed_width_column_wrapper; + + auto input_column = LCW( + {LCW{"3", "2", "1"}, LCW{}, LCW{"30", "20", "10", "50"}, LCW{"100", "120"}, LCW{"0"}, LCW{}}, + null_at(1)); + auto input = lists_column_view(input_column); + + { + // Test fetching first element. + auto result = extract_list_element(input, indices{0, 0, 0, 0, 0, 0}); + auto expected = strings({"3", "", "30", "100", "0", ""}, nulls_at({1, 5})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test fetching last element. + auto result = extract_list_element(input, indices{2, 0, 3, 1, 0, 0}); + auto expected = strings({"1", "", "50", "120", "0", ""}, nulls_at({1, 5})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test fetching *all* out of bounds. 
+ auto result = extract_list_element(input, indices{9, 9, 9, 9, 9, 9}); + auto expected = strings({"", "", "", "", "", ""}, all_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test fetching first from the end. + auto result = extract_list_element(input, indices{-1, -1, -1, -1, -1, -1}); + auto expected = strings({"1", "", "50", "120", "0", ""}, nulls_at({1, 5})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test fetching last from the end. + auto result = extract_list_element(input, indices{-3, 0, -4, -2, -1, 0}); + auto expected = strings({"3", "", "30", "100", "0", ""}, nulls_at({1, 5})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test fetching *all* negative out of bounds. + auto result = extract_list_element(input, indices{-9, -9, -9, -9, -9, -9}); + auto expected = strings({"", "", "", "", "", ""}, all_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test mixed indices. + auto result = extract_list_element(input, indices{-2, 0, 3, -1, 0, 0}); + auto expected = strings({"2", "", "50", "120", "0", ""}, nulls_at({1, 5})); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + { + // Test possibly null indices. + auto result = extract_list_element(input, indices{{-2, 0, 3, -1, 0, 0}, nulls_at({2, 4})}); + auto expected = strings({"2", "", "50", "120", "", ""}, nulls_at({1, 2, 4, 5})); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); } } diff --git a/cpp/tests/lists/sort_lists_tests.cpp b/cpp/tests/lists/sort_lists_tests.cpp index d085d02d67b..ade626a5c2b 100644 --- a/cpp/tests/lists/sort_lists_tests.cpp +++ b/cpp/tests/lists/sort_lists_tests.cpp @@ -17,35 +17,29 @@ #include #include #include -#include #include -#include -#include -#include -#include #include -#include -#include -#include - -#include -#include -#include template using LCW = cudf::test::lists_column_wrapper; -using cudf::lists_column_view; -using cudf::lists::sort_lists; namespace cudf { namespace test { +auto generate_sorted_lists(lists_column_view const& input, + order column_order, + null_order null_precedence) +{ + return std::pair{lists::sort_lists(input, column_order, null_precedence), + lists::stable_sort_lists(input, column_order, null_precedence)}; +} + template struct SortLists : public BaseFixture { }; -TYPED_TEST_CASE(SortLists, NumericTypes); +TYPED_TEST_SUITE(SortLists, NumericTypes); using SortListsInt = SortLists; /* @@ -73,20 +67,34 @@ TYPED_TEST(SortLists, NoNull) // Ascending // LCW order{{2, 1, 0, 3}, {0}, {1, 2, 0}, {0, 1}}; LCW expected{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; - auto results = sort_lists(lists_column_view{list}, order::ASCENDING, null_order::AFTER); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); - - results = sort_lists(lists_column_view{list}, order::ASCENDING, null_order::BEFORE); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + { + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{list}, order::ASCENDING, null_order::AFTER); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); + } + { + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{list}, order::ASCENDING, null_order::BEFORE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); + } // Descending // LCW order{{3, 0, 1, 2}, {0}, {0, 1, 2}, {1, 0}}; 
LCW expected2{{4, 3, 2, 1}, {5}, {10, 9, 8}, {7, 6}}; - results = sort_lists(lists_column_view{list}, order::DESCENDING, null_order::AFTER); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected2); - - results = sort_lists(lists_column_view{list}, order::DESCENDING, null_order::BEFORE); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected2); + { + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{list}, order::DESCENDING, null_order::AFTER); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected2); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected2); + } + { + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{list}, order::DESCENDING, null_order::BEFORE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected2); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected2); + } } TYPED_TEST(SortLists, Null) @@ -100,60 +108,99 @@ TYPED_TEST(SortLists, Null) // List LCW list{{{3, 2, 4, 1}, valids_o.begin()}, {5}, {10, 8, 9}, {6, 7}}; // LCW order{{2, 1, 3, 0}, {0}, {1, 2, 0}, {0, 1}}; - LCW expected1{{{1, 2, 3, 4}, valids_a.begin()}, {5}, {8, 9, 10}, {6, 7}}; - LCW expected2{{{4, 1, 2, 3}, valids_b.begin()}, {5}, {8, 9, 10}, {6, 7}}; - auto results = sort_lists(lists_column_view{list}, order::ASCENDING, null_order::AFTER); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected1); - results = sort_lists(lists_column_view{list}, order::ASCENDING, null_order::BEFORE); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected2); + { + LCW expected{{{1, 2, 3, 4}, valids_a.begin()}, {5}, {8, 9, 10}, {6, 7}}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{list}, order::ASCENDING, null_order::AFTER); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); + } + + { + LCW expected{{{4, 1, 2, 3}, valids_b.begin()}, {5}, {8, 9, 10}, {6, 7}}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{list}, order::ASCENDING, null_order::BEFORE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); + } // Descending // LCW order{{3, 0, 1, 2}, {0}, {0, 1, 2}, {1, 0}}; - LCW expected3{{{4, 3, 2, 1}, valids_b.begin()}, {5}, {10, 9, 8}, {7, 6}}; - LCW expected4{{{3, 2, 1, 4}, valids_a.begin()}, {5}, {10, 9, 8}, {7, 6}}; - results = sort_lists(lists_column_view{list}, order::DESCENDING, null_order::AFTER); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected3); + { + LCW expected{{{4, 3, 2, 1}, valids_b.begin()}, {5}, {10, 9, 8}, {7, 6}}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{list}, order::DESCENDING, null_order::AFTER); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); + } - results = sort_lists(lists_column_view{list}, order::DESCENDING, null_order::BEFORE); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected4); + { + LCW expected{{{3, 2, 1, 4}, valids_a.begin()}, {5}, {10, 9, 8}, {7, 6}}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{list}, order::DESCENDING, null_order::BEFORE); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); 
+ } } TEST_F(SortListsInt, Empty) { using T = int; - LCW l1{}; - LCW l2{LCW{}}; - LCW l3{LCW{}, LCW{}}; - - auto results = sort_lists(lists_column_view{l1}, {}, {}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), l1); - results = sort_lists(lists_column_view{l2}, {}, {}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), l2); - results = sort_lists(lists_column_view{l3}, {}, {}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), l3); + + { + LCW l{}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{l}, {}, {}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), l); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), l); + } + { + LCW l{LCW{}}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{l}, {}, {}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), l); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), l); + } + { + LCW l{LCW{}, LCW{}}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{l}, {}, {}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), l); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), l); + } } TEST_F(SortListsInt, Single) { using T = int; - LCW l1{{1}}; - LCW l2{{1, 2, 3}}; - auto results = sort_lists(lists_column_view{l1}, {}, {}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), l1); - results = sort_lists(lists_column_view{l2}, {}, {}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), l2); + { + LCW l{1}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{l}, {}, {}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), l); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), l); + } + { + LCW l{{1, 2, 3}}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{l}, {}, {}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), l); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), l); + } } TEST_F(SortListsInt, NullRows) { using T = int; std::vector valids{0, 1, 0}; - LCW l1{{{1, 2, 3}, {4, 5, 6}, {7}}, valids.begin()}; // offset 0, 0, 3, 3 + LCW l{{{1, 2, 3}, {4, 5, 6}, {7}}, valids.begin()}; // offset 0, 0, 3, 3 - auto results = sort_lists(lists_column_view{l1}, {}, {}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), l1); + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{l}, {}, {}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), l); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), l); } /* @@ -171,23 +218,43 @@ TEST_F(SortListsInt, Depth) TEST_F(SortListsInt, Sliced) { using T = int; - LCW l1{{3, 2, 1, 4}, {7, 5, 6}, {8, 9}, {10}}; + LCW l{{3, 2, 1, 4}, {7, 5, 6}, {8, 9}, {10}}; - auto sliced_list = cudf::slice(l1, {0, 4})[0]; - auto results = sort_lists(lists_column_view{sliced_list}, {}, {}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), LCW{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}}); + { + auto const sliced_list = cudf::slice(l, {0, 4})[0]; + auto const expected = LCW{{1, 2, 3, 4}, {5, 6, 7}, {8, 9}, {10}}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{sliced_list}, {}, {}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); + } - sliced_list = cudf::slice(l1, {1, 4})[0]; - results = sort_lists(lists_column_view{sliced_list}, {}, {}); - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), LCW{{5, 6, 7}, {8, 9}, {10}}); + { + auto const sliced_list = cudf::slice(l, {1, 4})[0]; + auto const expected = LCW{{5, 6, 7}, {8, 9}, {10}}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{sliced_list}, {}, {}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); + } - sliced_list = cudf::slice(l1, {1, 2})[0]; - results = sort_lists(lists_column_view{sliced_list}, {}, {}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), LCW{{5, 6, 7}}); + { + auto const sliced_list = cudf::slice(l, {1, 2})[0]; + auto const expected = LCW{{5, 6, 7}}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{sliced_list}, {}, {}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); + } - sliced_list = cudf::slice(l1, {0, 2})[0]; - results = sort_lists(lists_column_view{sliced_list}, {}, {}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), LCW{{1, 2, 3, 4}, {5, 6, 7}}); + { + auto const sliced_list = cudf::slice(l, {0, 2})[0]; + auto const expected = LCW{{1, 2, 3, 4}, {5, 6, 7}}; + auto const [sorted_lists, stable_sorted_lists] = + generate_sorted_lists(lists_column_view{sliced_list}, {}, {}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_lists->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(stable_sorted_lists->view(), expected); + } } } // namespace test diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index 0cd5d68ea39..b3e3907a088 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -44,7 +44,7 @@ template class MergeStringTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(MergeStringTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(MergeStringTest, cudf::test::FixedWidthTypes); TYPED_TEST(MergeStringTest, Merge1StringKeyColumns) { diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 452f3adfdbb..0bc5d047612 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -36,7 +36,7 @@ template class MergeTest_ : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(MergeTest_, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(MergeTest_, cudf::test::FixedWidthTypes); TYPED_TEST(MergeTest_, MergeIsZeroWhenShouldNotBeZero) { @@ -880,7 +880,7 @@ struct FixedPointTestBothReps : public cudf::test::BaseFixture { template using fp_wrapper = cudf::test::fixed_point_column_wrapper; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, FixedPointMerge) { diff --git a/cpp/tests/partitioning/hash_partition_test.cpp b/cpp/tests/partitioning/hash_partition_test.cpp index 1a39c7701f6..9eb9814373e 100644 --- a/cpp/tests/partitioning/hash_partition_test.cpp +++ b/cpp/tests/partitioning/hash_partition_test.cpp @@ -247,7 +247,7 @@ template class HashPartitionFixedWidth : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(HashPartitionFixedWidth, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(HashPartitionFixedWidth, cudf::test::FixedWidthTypesWithoutFixedPoint); TYPED_TEST(HashPartitionFixedWidth, NullableFixedWidth) { diff --git a/cpp/tests/partitioning/partition_test.cpp b/cpp/tests/partitioning/partition_test.cpp index 
669d406d80a..785af409c4c 100644 --- a/cpp/tests/partitioning/partition_test.cpp +++ b/cpp/tests/partitioning/partition_test.cpp @@ -35,7 +35,7 @@ using types = // using types = cudf::test::Types >; -TYPED_TEST_CASE(PartitionTest, types); +TYPED_TEST_SUITE(PartitionTest, types); using cudf::test::fixed_width_column_wrapper; using cudf::test::strings_column_wrapper; @@ -234,7 +234,7 @@ template class PartitionTestFixedPoint : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(PartitionTestFixedPoint, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(PartitionTestFixedPoint, cudf::test::FixedPointTypes); TYPED_TEST(PartitionTestFixedPoint, Partition) { diff --git a/cpp/tests/partitioning/round_robin_test.cpp b/cpp/tests/partitioning/round_robin_test.cpp index 160365834fe..a72c22f5714 100644 --- a/cpp/tests/partitioning/round_robin_test.cpp +++ b/cpp/tests/partitioning/round_robin_test.cpp @@ -45,7 +45,7 @@ template class RoundRobinTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(RoundRobinTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(RoundRobinTest, cudf::test::FixedWidthTypes); TYPED_TEST(RoundRobinTest, RoundRobinPartitions13_3) { diff --git a/cpp/tests/quantiles/percentile_approx_test.cu b/cpp/tests/quantiles/percentile_approx_test.cu index 39f7cc593d6..2f4d5a7a604 100644 --- a/cpp/tests/quantiles/percentile_approx_test.cu +++ b/cpp/tests/quantiles/percentile_approx_test.cu @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -17,6 +18,7 @@ #include using namespace cudf; +using namespace cudf::tdigest; struct tdigest_gen { template < @@ -116,8 +118,8 @@ struct percentile_approx_dispatch { cudf::test::fixed_width_column_wrapper g_percentages(percentages.begin(), percentages.end()); - structs_column_view scv(*(gb_result.second[0].results[0])); - auto result = cudf::percentile_approx(scv, g_percentages); + tdigest_column_view tdv(*(gb_result.second[0].results[0])); + auto result = cudf::percentile_approx(tdv, g_percentages); cudf::test::expect_columns_equivalent( *expected, *result, cudf::test::debug_output_level::FIRST_ERROR, ulps); @@ -194,8 +196,8 @@ void percentile_approx_test(column_view const& _keys, cudf::test::fixed_width_column_wrapper g_percentages(percentages.begin(), percentages.end()); - structs_column_view scv(*(gb_result.second[0].results[0])); - auto result = cudf::percentile_approx(scv, g_percentages); + tdigest_column_view tdv(*(gb_result.second[0].results[0])); + auto result = cudf::percentile_approx(tdv, g_percentages); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *result); } @@ -304,7 +306,7 @@ using PercentileApproxTypes = template struct PercentileApproxInputTypesTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(PercentileApproxInputTypesTest, PercentileApproxTypes); +TYPED_TEST_SUITE(PercentileApproxInputTypesTest, PercentileApproxTypes); TYPED_TEST(PercentileApproxInputTypesTest, Simple) { @@ -364,15 +366,15 @@ TEST_F(PercentileApproxTest, EmptyInput) input.push_back(*empty_); auto empty = cudf::concatenate(input); - structs_column_view scv(*empty); - auto result = cudf::percentile_approx(scv, percentiles); + tdigest_column_view tdv(*empty); + auto result = cudf::percentile_approx(tdv, percentiles); cudf::test::fixed_width_column_wrapper offsets{0, 0, 0, 0}; std::vector nulls{0, 0, 0}; auto expected = cudf::make_lists_column(3, offsets.release(), - cudf::make_empty_column(data_type{type_id::FLOAT64}), + cudf::make_empty_column(type_id::FLOAT64), 3, cudf::test::detail::make_null_mask(nulls.begin(), nulls.end())); @@ 
-395,13 +397,13 @@ TEST_F(PercentileApproxTest, EmptyPercentiles) cudf::test::fixed_width_column_wrapper percentiles{}; - structs_column_view scv(*tdigest_column.second[0].results[0]); - auto result = cudf::percentile_approx(scv, percentiles); + tdigest_column_view tdv(*tdigest_column.second[0].results[0]); + auto result = cudf::percentile_approx(tdv, percentiles); cudf::test::fixed_width_column_wrapper offsets{0, 0, 0}; auto expected = cudf::make_lists_column(2, offsets.release(), - cudf::make_empty_column(data_type{type_id::FLOAT64}), + cudf::make_empty_column(type_id::FLOAT64), 2, cudf::detail::create_null_mask(2, mask_state::ALL_NULL)); @@ -422,14 +424,14 @@ TEST_F(PercentileApproxTest, NullPercentiles) requests.push_back({values, std::move(aggregations)}); auto tdigest_column = gb.aggregate(requests); - structs_column_view scv(*tdigest_column.second[0].results[0]); + tdigest_column_view tdv(*tdigest_column.second[0].results[0]); cudf::test::fixed_width_column_wrapper npercentiles{{0.5, 0.5, 1.0, 1.0}, {0, 0, 1, 1}}; - auto result = cudf::percentile_approx(scv, npercentiles); + auto result = cudf::percentile_approx(tdv, npercentiles); std::vector valids{0, 0, 1, 1}; cudf::test::lists_column_wrapper expected{{{99, 99, 4, 4}, valids.begin()}, {{99, 99, 8, 8}, valids.begin()}}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); -} \ No newline at end of file +} diff --git a/cpp/tests/quantiles/quantile_test.cpp b/cpp/tests/quantiles/quantile_test.cpp index c1fbde4e9ae..d4cc0ec4738 100644 --- a/cpp/tests/quantiles/quantile_test.cpp +++ b/cpp/tests/quantiles/quantile_test.cpp @@ -391,7 +391,7 @@ struct QuantileTest : public BaseFixture { }; using TestTypes = NumericTypes; -TYPED_TEST_CASE(QuantileTest, TestTypes); +TYPED_TEST_SUITE(QuantileTest, TestTypes); TYPED_TEST(QuantileTest, TestSingle) { test(testdata::single()); } @@ -424,8 +424,9 @@ template struct QuantileUnsupportedTypesTest : public BaseFixture { }; -using UnsupportedTestTypes = RemoveIf, AllTypes>; -TYPED_TEST_CASE(QuantileUnsupportedTypesTest, UnsupportedTestTypes); +// TODO add tests for FixedPointTypes +using UnsupportedTestTypes = RemoveIf>, AllTypes>; +TYPED_TEST_SUITE(QuantileUnsupportedTypesTest, UnsupportedTestTypes); TYPED_TEST(QuantileUnsupportedTypesTest, TestZeroElements) { diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp index 5633e03cc90..b4d1b9984ab 100644 --- a/cpp/tests/quantiles/quantiles_test.cpp +++ b/cpp/tests/quantiles/quantiles_test.cpp @@ -38,7 +38,7 @@ struct QuantilesTest : public BaseFixture { using TestTypes = AllTypes; -TYPED_TEST_CASE(QuantilesTest, TestTypes); +TYPED_TEST_SUITE(QuantilesTest, TestTypes); TYPED_TEST(QuantilesTest, TestZeroColumns) { diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp new file mode 100644 index 00000000000..2249fac4e2e --- /dev/null +++ b/cpp/tests/reductions/rank_tests.cpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "scan_tests.hpp" + +#include +#include +#include + +#include +#include + +using aggregation = cudf::aggregation; +using cudf::null_policy; +using cudf::scan_type; +using namespace cudf::test::iterators; + +template +struct TypedRankScanTest : BaseScanTest { + inline void test_ungrouped_rank_scan(cudf::column_view const& input, + cudf::column_view const& expect_vals, + std::unique_ptr const& agg, + null_policy null_handling) + { + auto col_out = cudf::scan(input, agg, scan_type::INCLUSIVE, null_handling); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expect_vals, col_out->view()); + } +}; + +using RankTypes = cudf::test::Concat; + +TYPED_TEST_SUITE(TypedRankScanTest, RankTypes); + +TYPED_TEST(TypedRankScanTest, Rank) +{ + auto const v = [] { + if (std::is_signed::value) + return make_vector({-120, -120, -120, -16, -16, 5, 6, 6, 6, 6, 34, 113}); + return make_vector({5, 5, 5, 6, 6, 9, 11, 11, 11, 11, 14, 34}); + }(); + auto col = this->make_column(v); + + auto const expected_dense_vals = + cudf::test::fixed_width_column_wrapper{1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6}; + auto const expected_rank_vals = + cudf::test::fixed_width_column_wrapper{1, 1, 1, 4, 4, 6, 7, 7, 7, 7, 11, 12}; + this->test_ungrouped_rank_scan( + *col, expected_dense_vals, cudf::make_dense_rank_aggregation(), null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + *col, expected_rank_vals, cudf::make_rank_aggregation(), null_policy::INCLUDE); +} + +TYPED_TEST(TypedRankScanTest, RankWithNulls) +{ + auto const v = [] { + if (std::is_signed::value) + return make_vector({-120, -120, -120, -16, -16, 5, 6, 6, 6, 6, 34, 113}); + return make_vector({5, 5, 5, 6, 6, 9, 11, 11, 11, 11, 14, 34}); + }(); + auto const b = thrust::host_vector(std::vector{1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0}); + auto col = this->make_column(v, b); + + auto const expected_dense_vals = + cudf::test::fixed_width_column_wrapper{1, 1, 1, 2, 3, 4, 5, 5, 6, 6, 7, 8}; + auto const expected_rank_vals = + cudf::test::fixed_width_column_wrapper{1, 1, 1, 4, 5, 6, 7, 7, 9, 9, 11, 12}; + this->test_ungrouped_rank_scan( + *col, expected_dense_vals, cudf::make_dense_rank_aggregation(), null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + *col, expected_rank_vals, cudf::make_rank_aggregation(), null_policy::INCLUDE); +} + +TYPED_TEST(TypedRankScanTest, MixedStructs) +{ + auto const v = [] { + if (std::is_signed::value) + return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); + return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); + }(); + auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); + auto col = this->make_column(v, b); + auto strings = cudf::test::strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + std::vector> vector_of_columns; + vector_of_columns.push_back(std::move(col)); + vector_of_columns.push_back(strings.release()); + auto struct_col = cudf::test::structs_column_wrapper{std::move(vector_of_columns)}.release(); + + auto expected_dense_vals = + cudf::test::fixed_width_column_wrapper{1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 8}; + auto expected_rank_vals = + cudf::test::fixed_width_column_wrapper{1, 1, 3, 3, 5, 6, 7, 7, 9, 10, 10, 12}; + + this->test_ungrouped_rank_scan( + *struct_col, expected_dense_vals, cudf::make_dense_rank_aggregation(), null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + *struct_col, expected_rank_vals, 
cudf::make_rank_aggregation(), null_policy::INCLUDE); +} + +TYPED_TEST(TypedRankScanTest, NestedStructs) +{ + auto const v = [] { + if (std::is_signed::value) + return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); + return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); + }(); + auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); + auto col1 = this->make_column(v, b); + auto col2 = this->make_column(v, b); + auto col3 = this->make_column(v, b); + auto col4 = this->make_column(v, b); + auto strings1 = cudf::test::strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + auto strings2 = cudf::test::strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + + std::vector> struct_columns; + struct_columns.push_back(std::move(col1)); + struct_columns.push_back(strings1.release()); + auto struct_col = cudf::test::structs_column_wrapper{std::move(struct_columns)}; + std::vector> nested_columns; + nested_columns.push_back(struct_col.release()); + nested_columns.push_back(std::move(col2)); + auto nested_col = cudf::test::structs_column_wrapper{std::move(nested_columns)}; + std::vector> flat_columns; + flat_columns.push_back(std::move(col3)); + flat_columns.push_back(strings2.release()); + flat_columns.push_back(std::move(col4)); + auto flat_col = cudf::test::structs_column_wrapper{std::move(flat_columns)}; + + auto dense_out = cudf::scan( + nested_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + auto dense_expected = cudf::scan( + flat_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + auto rank_out = cudf::scan( + nested_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + auto rank_expected = + cudf::scan(flat_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), dense_expected->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), rank_expected->view()); +} + +TYPED_TEST(TypedRankScanTest, structsWithNullPushdown) +{ + auto const v = [] { + if (std::is_signed::value) + return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); + return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); + }(); + auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); + auto col = this->make_column(v, b); + auto strings = cudf::test::strings_column_wrapper{ + {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; + std::vector> struct_columns; + struct_columns.push_back(std::move(col)); + struct_columns.push_back(strings.release()); + + auto struct_col = + cudf::make_structs_column(12, std::move(struct_columns), 0, rmm::device_buffer{}); + + struct_col->set_null_mask(create_null_mask(12, cudf::mask_state::ALL_NULL)); + auto expected_null_result = + cudf::test::fixed_width_column_wrapper{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + auto dense_null_out = cudf::scan( + *struct_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + auto rank_null_out = cudf::scan( + *struct_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_null_out->view(), expected_null_result); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_null_out->view(), expected_null_result); + + auto const struct_nulls = + 
thrust::host_vector(std::vector{1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + struct_col->set_null_mask( + cudf::test::detail::make_null_mask(struct_nulls.begin(), struct_nulls.end())); + auto expected_dense_vals = + cudf::test::fixed_width_column_wrapper{1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9}; + auto expected_rank_vals = + cudf::test::fixed_width_column_wrapper{1, 2, 2, 4, 5, 6, 7, 7, 9, 10, 10, 12}; + auto dense_out = cudf::scan( + *struct_col, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + auto rank_out = cudf::scan( + *struct_col, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), expected_dense_vals); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), expected_rank_vals); +} + +struct RankScanTest : public cudf::test::BaseFixture { +}; + +TEST(RankScanTest, BoolRank) +{ + cudf::test::fixed_width_column_wrapper vals{0, 0, 0, 6, 6, 9, 11, 11, 11, 11, 14, 34}; + cudf::test::fixed_width_column_wrapper expected_dense_vals{ + 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + cudf::test::fixed_width_column_wrapper expected_rank_vals{ + 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4}; + + auto dense_out = cudf::scan( + vals, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + auto rank_out = + cudf::scan(vals, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense_vals, dense_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank_vals, rank_out->view()); +} + +TEST(RankScanTest, BoolRankWithNull) +{ + cudf::test::fixed_width_column_wrapper vals{{0, 0, 0, 6, 6, 9, 11, 11, 11, 11, 14, 34}, + {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}}; + cudf::table_view order_table{std::vector{vals}}; + cudf::test::fixed_width_column_wrapper expected_dense_vals{ + 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3}; + cudf::test::fixed_width_column_wrapper expected_rank_vals{ + 1, 1, 1, 4, 4, 4, 4, 4, 9, 9, 9, 9}; + + auto nullable_dense_out = cudf::scan( + vals, cudf::make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + auto nullable_rank_out = + cudf::scan(vals, cudf::make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense_vals, nullable_dense_out->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank_vals, nullable_rank_out->view()); +} + +TEST(RankScanTest, ExclusiveScan) +{ + cudf::test::fixed_width_column_wrapper vals{3, 4, 5}; + cudf::test::fixed_width_column_wrapper order_col{3, 3, 1}; + cudf::table_view order_table{std::vector{order_col}}; + + CUDF_EXPECT_THROW_MESSAGE( + cudf::scan( + vals, cudf::make_dense_rank_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE), + "Unsupported dense rank aggregation operator for exclusive scan"); + CUDF_EXPECT_THROW_MESSAGE( + cudf::scan(vals, cudf::make_rank_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE), + "Unsupported rank aggregation operator for exclusive scan"); +} diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 88318a41882..e3a7a378d35 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -108,7 +108,7 @@ struct ReductionTest : public cudf::test::BaseFixture { using ScalarType = cudf::scalar_type_t; auto result1 = static_cast(result.get()); EXPECT_EQ(expected_null, !result1->is_valid()); - if (result1->is_valid()) { EXPECT_EQ(expected_value, 
result1->value()); } + if (result1->is_valid()) { EXPECT_EQ(expected_value, T_out{result1->value()}); } }; if (succeeded_condition) { @@ -124,7 +124,7 @@ struct MinMaxReductionTest : public ReductionTest { }; using MinMaxTypes = cudf::test::AllTypes; -TYPED_TEST_CASE(MinMaxReductionTest, MinMaxTypes); +TYPED_TEST_SUITE(MinMaxReductionTest, MinMaxTypes); // ------------------------------------------------------------------------ TYPED_TEST(MinMaxReductionTest, MinMax) @@ -152,8 +152,8 @@ TYPED_TEST(MinMaxReductionTest, MinMax) using ScalarType = cudf::scalar_type_t; auto min_result = static_cast(res.first.get()); auto max_result = static_cast(res.second.get()); - EXPECT_EQ(min_result->value(), expected_min_result); - EXPECT_EQ(max_result->value(), expected_max_result); + EXPECT_EQ(T{min_result->value()}, expected_min_result); + EXPECT_EQ(T{max_result->value()}, expected_max_result); // test with some nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(v, host_bools); @@ -174,8 +174,8 @@ TYPED_TEST(MinMaxReductionTest, MinMax) using ScalarType = cudf::scalar_type_t; auto min_null_result = static_cast(null_res.first.get()); auto max_null_result = static_cast(null_res.second.get()); - EXPECT_EQ(min_null_result->value(), expected_min_null_result); - EXPECT_EQ(max_null_result->value(), expected_max_null_result); + EXPECT_EQ(T{min_null_result->value()}, expected_min_null_result); + EXPECT_EQ(T{max_null_result->value()}, expected_max_null_result); // test with all null cudf::test::fixed_width_column_wrapper col_all_nulls = construct_null_column(v, all_null); @@ -214,7 +214,7 @@ template struct SumReductionTest : public ReductionTest { }; using SumTypes = cudf::test::Concat; -TYPED_TEST_CASE(SumReductionTest, SumTypes); +TYPED_TEST_SUITE(SumReductionTest, SumTypes); TYPED_TEST(SumReductionTest, Sum) { @@ -237,7 +237,7 @@ TYPED_TEST(SumReductionTest, Sum) col_nulls, expected_null_value, this->ret_non_arithmetic, cudf::make_sum_aggregation()); } -TYPED_TEST_CASE(ReductionTest, cudf::test::NumericTypes); +TYPED_TEST_SUITE(ReductionTest, cudf::test::NumericTypes); TYPED_TEST(ReductionTest, Product) { @@ -302,7 +302,7 @@ template struct ReductionAnyAllTest : public ReductionTest { }; -TYPED_TEST_CASE(ReductionAnyAllTest, cudf::test::NumericTypes); +TYPED_TEST_SUITE(ReductionAnyAllTest, cudf::test::NumericTypes); TYPED_TEST(ReductionAnyAllTest, AnyAllTrueTrue) { @@ -367,7 +367,7 @@ struct MultiStepReductionTest : public ReductionTest { }; using MultiStepReductionTypes = cudf::test::NumericTypes; -TYPED_TEST_CASE(MultiStepReductionTest, MultiStepReductionTypes); +TYPED_TEST_SUITE(MultiStepReductionTest, MultiStepReductionTypes); TYPED_TEST(MultiStepReductionTest, Mean) { @@ -477,7 +477,7 @@ struct ReductionMultiStepErrorCheck : public ReductionTest { } }; -TYPED_TEST_CASE(ReductionMultiStepErrorCheck, cudf::test::AllTypes); +TYPED_TEST_SUITE(ReductionMultiStepErrorCheck, cudf::test::AllTypes); // This test is disabled for only a Debug build because a compiler error // documented in cpp/src/reductions/std.cu and cpp/src/reductions/var.cu @@ -558,6 +558,20 @@ struct ReductionDtypeTest : public cudf::test::BaseFixture { } }; +TEST_F(ReductionDtypeTest, all_null_output) +{ + auto sum_agg = cudf::make_sum_aggregation(); + + auto const col = + cudf::test::fixed_point_column_wrapper{{0, 0, 0}, {0, 0, 0}, numeric::scale_type{-2}} + .release(); + + std::unique_ptr result = cudf::reduce(*col, sum_agg, col->type()); + EXPECT_EQ(result->is_valid(), false); + EXPECT_EQ(result->type().id(), 
col->type().id()); + EXPECT_EQ(result->type().scale(), col->type().scale()); +} + // test case for different output precision TEST_F(ReductionDtypeTest, different_precision) { @@ -1046,7 +1060,7 @@ template struct FixedPointTestBothReps : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, FixedPointReductionProductZeroScale) { @@ -1515,7 +1529,7 @@ template struct DictionaryAnyAllTest : public ReductionTest { }; -TYPED_TEST_CASE(DictionaryAnyAllTest, cudf::test::NumericTypes); +TYPED_TEST_SUITE(DictionaryAnyAllTest, cudf::test::NumericTypes); TYPED_TEST(DictionaryAnyAllTest, AnyAll) { using T = TypeParam; @@ -1581,7 +1595,7 @@ struct DictionaryReductionTest : public ReductionTest { }; using DictionaryTypes = cudf::test::Types; -TYPED_TEST_CASE(DictionaryReductionTest, DictionaryTypes); +TYPED_TEST_SUITE(DictionaryReductionTest, DictionaryTypes); TYPED_TEST(DictionaryReductionTest, Sum) { using T = TypeParam; diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index ab6eb144be5..d1e983460d5 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -14,129 +14,56 @@ * limitations under the License. */ -#include +#include + #include #include -#include -#include -#include #include #include -#include +#include #include -#include -#include -#include +#include #include -#include -#include -#include -#include #include -#include -#include using aggregation = cudf::aggregation; -using cudf::column_view; using cudf::null_policy; using cudf::scan_type; -using namespace cudf::test::iterators; - -namespace cudf { -namespace test { - -template -struct TypeParam_to_host_type { - using type = T; -}; - -template <> -struct TypeParam_to_host_type { - using type = std::string; -}; - -template <> -struct TypeParam_to_host_type { - using type = numeric::decimal32::rep; -}; - -template <> -struct TypeParam_to_host_type { - using type = numeric::decimal64::rep; -}; - -template -typename std::enable_if, - thrust::host_vector>::type -make_vector(std::initializer_list const& init) -{ - return make_type_param_vector(init); -} - -template -typename std::enable_if(), - thrust::host_vector>::type -make_vector(std::initializer_list const& init) -{ - return make_type_param_vector(init); -} - -template -typename std::enable_if || is_fixed_point()), - thrust::host_vector>::type -make_vector(std::initializer_list const& init) -{ - return make_type_param_vector(init); -} // This is the main test feature template -struct ScanTest : public BaseFixture { - typedef typename TypeParam_to_host_type::type HostType; +struct ScanTest : public BaseScanTest { + using HostType = typename BaseScanTest::HostType; - void scan_test(host_span v, - host_span b, + void scan_test(cudf::host_span v, + cudf::host_span b, std::unique_ptr const& agg, scan_type inclusive, null_policy null_handling, numeric::scale_type scale) { - bool const do_print = false; // set true for debugging - auto col_in = this->make_column(v, b, scale); - std::unique_ptr col_out; - std::unique_ptr expected_col_out; if (not this->params_supported(agg, inclusive)) { - EXPECT_THROW(scan(*col_in, agg, inclusive, null_handling), logic_error); + EXPECT_THROW(scan(*col_in, agg, inclusive, null_handling), cudf::logic_error); } else { - expected_col_out = this->make_expected(v, b, agg, inclusive, null_handling, scale); - col_out = scan(*col_in, agg, 
inclusive, null_handling); + auto expected_col_out = this->make_expected(v, b, agg, inclusive, null_handling, scale); + auto col_out = scan(*col_in, agg, inclusive, null_handling); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected_col_out, *col_out); - - if constexpr (do_print) { - std::cout << "input = "; - print(*col_in); - std::cout << "expected = "; - print(*expected_col_out); - std::cout << "result = "; - print(*col_out); - std::cout << std::endl; - } } } // Overload to iterate the test over a few different scales for fixed-point tests - void scan_test(host_span v, - host_span b, + void scan_test(cudf::host_span v, + cudf::host_span b, std::unique_ptr const& agg, scan_type inclusive, null_policy null_handling = null_policy::EXCLUDE) { - if constexpr (is_fixed_point()) { + if constexpr (cudf::is_fixed_point()) { for (auto scale : {0, -1, -2, -3}) { scan_test(v, b, agg, inclusive, null_handling, numeric::scale_type{scale}); } @@ -147,49 +74,28 @@ struct ScanTest : public BaseFixture { bool params_supported(std::unique_ptr const& agg, scan_type inclusive) { - if constexpr (std::is_same_v) { - bool supported_agg = (agg->kind == aggregation::MIN || agg->kind == aggregation::MAX || - agg->kind == aggregation::RANK || agg->kind == aggregation::DENSE_RANK); - return supported_agg && (inclusive == scan_type::INCLUSIVE); - } else if constexpr (is_fixed_point()) { - bool supported_agg = (agg->kind == aggregation::MIN || agg->kind == aggregation::MAX || - agg->kind == aggregation::SUM || agg->kind == aggregation::RANK || - agg->kind == aggregation::DENSE_RANK); - return supported_agg; - } else if constexpr (std::is_arithmetic()) { - bool supported_agg = (agg->kind == aggregation::MIN || agg->kind == aggregation::MAX || - agg->kind == aggregation::SUM || agg->kind == aggregation::PRODUCT || - agg->kind == aggregation::RANK || agg->kind == aggregation::DENSE_RANK); - return supported_agg; - } else { + bool supported = [&] { + switch (agg->kind) { + case aggregation::SUM: return std::is_invocable_v; + case aggregation::PRODUCT: return std::is_invocable_v; + case aggregation::MIN: return std::is_invocable_v; + case aggregation::MAX: return std::is_invocable_v; + default: return false; + } return false; - } - } - - std::unique_ptr make_column(host_span v, - host_span b = {}, - numeric::scale_type scale = numeric::scale_type{0}) - { - if constexpr (std::is_same_v) { - auto col = (b.size() > 0) ? strings_column_wrapper(v.begin(), v.end(), b.begin()) - : strings_column_wrapper(v.begin(), v.end()); - return col.release(); - } else if constexpr (is_fixed_point()) { - auto col = - (b.size() > 0) - ? fixed_point_column_wrapper(v.begin(), v.end(), b.begin(), scale) - : fixed_point_column_wrapper(v.begin(), v.end(), scale); - return col.release(); - } else { - auto col = (b.size() > 0) ? 
fixed_width_column_wrapper(v.begin(), v.end(), b.begin()) - : fixed_width_column_wrapper(v.begin(), v.end()); - return col.release(); - } + }(); + + // special cases for individual types + if constexpr (cudf::is_fixed_point()) + return supported && (agg->kind != aggregation::PRODUCT); + if constexpr (std::is_same_v || cudf::is_timestamp()) + return supported && (inclusive == scan_type::INCLUSIVE); + return supported; } std::function make_agg(std::unique_ptr const& agg) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { switch (agg->kind) { case aggregation::MIN: return [](HostType a, HostType b) { return std::min(a, b); }; case aggregation::MAX: return [](HostType a, HostType b) { return std::max(a, b); }; @@ -214,7 +120,7 @@ struct ScanTest : public BaseFixture { HostType make_identity(std::unique_ptr const& agg) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { switch (agg->kind) { case aggregation::MIN: return std::string{"\xF7\xBF\xBF\xBF"}; case aggregation::MAX: return std::string{}; @@ -237,12 +143,12 @@ struct ScanTest : public BaseFixture { } } - std::unique_ptr make_expected(host_span v, - host_span b, - std::unique_ptr const& agg, - scan_type inclusive, - null_policy null_handling, - numeric::scale_type scale = numeric::scale_type{0}) + std::unique_ptr make_expected(cudf::host_span v, + cudf::host_span b, + std::unique_ptr const& agg, + scan_type inclusive, + null_policy null_handling, + numeric::scale_type scale = numeric::scale_type{0}) { auto op = this->make_agg(agg); auto identity = this->make_identity(agg); @@ -294,9 +200,10 @@ struct ScanTest : public BaseFixture { } }; -using TestTypes = Concat>; +using TestTypes = cudf::test:: + Concat; -TYPED_TEST_CASE(ScanTest, TestTypes); +TYPED_TEST_SUITE(ScanTest, TestTypes); TYPED_TEST(ScanTest, Min) { @@ -304,14 +211,14 @@ TYPED_TEST(ScanTest, Min) auto const b = thrust::host_vector(std::vector{1, 0, 1, 1, 1, 1, 0, 0, 1}); // no nulls - this->scan_test(v, {}, make_min_aggregation(), scan_type::INCLUSIVE); - this->scan_test(v, {}, make_min_aggregation(), scan_type::EXCLUSIVE); + this->scan_test(v, {}, cudf::make_min_aggregation(), scan_type::INCLUSIVE); + this->scan_test(v, {}, cudf::make_min_aggregation(), scan_type::EXCLUSIVE); // skipna = true (default) - this->scan_test(v, b, make_min_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); - this->scan_test(v, b, make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); // skipna = false - this->scan_test(v, b, make_min_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - this->scan_test(v, b, make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE); } TYPED_TEST(ScanTest, Max) @@ -321,14 +228,14 @@ TYPED_TEST(ScanTest, Max) // inclusive // no nulls - this->scan_test(v, {}, make_max_aggregation(), scan_type::INCLUSIVE); - this->scan_test(v, {}, make_max_aggregation(), scan_type::EXCLUSIVE); + this->scan_test(v, {}, cudf::make_max_aggregation(), scan_type::INCLUSIVE); + this->scan_test(v, {}, cudf::make_max_aggregation(), scan_type::EXCLUSIVE); // skipna = true (default) - this->scan_test(v, b, make_max_aggregation(), 
scan_type::INCLUSIVE, null_policy::EXCLUDE); - this->scan_test(v, b, make_max_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, b, cudf::make_max_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, b, cudf::make_max_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); // skipna = false - this->scan_test(v, b, make_max_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - this->scan_test(v, b, make_max_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE); + this->scan_test(v, b, cudf::make_max_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + this->scan_test(v, b, cudf::make_max_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE); } TYPED_TEST(ScanTest, Product) @@ -337,14 +244,18 @@ TYPED_TEST(ScanTest, Product) auto const b = thrust::host_vector(std::vector{1, 1, 1, 0, 1, 1}); // no nulls - this->scan_test(v, {}, make_product_aggregation(), scan_type::INCLUSIVE); - this->scan_test(v, {}, make_product_aggregation(), scan_type::EXCLUSIVE); + this->scan_test(v, {}, cudf::make_product_aggregation(), scan_type::INCLUSIVE); + this->scan_test(v, {}, cudf::make_product_aggregation(), scan_type::EXCLUSIVE); // skipna = true (default) - this->scan_test(v, b, make_product_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); - this->scan_test(v, b, make_product_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); + this->scan_test( + v, b, cudf::make_product_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); + this->scan_test( + v, b, cudf::make_product_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); // skipna = false - this->scan_test(v, b, make_product_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - this->scan_test(v, b, make_product_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE); + this->scan_test( + v, b, cudf::make_product_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + this->scan_test( + v, b, cudf::make_product_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE); } TYPED_TEST(ScanTest, Sum) @@ -357,14 +268,14 @@ TYPED_TEST(ScanTest, Sum) auto const b = thrust::host_vector(std::vector{1, 0, 1, 1, 0, 0, 1, 1, 1, 1}); // no nulls - this->scan_test(v, {}, make_sum_aggregation(), scan_type::INCLUSIVE); - this->scan_test(v, {}, make_sum_aggregation(), scan_type::EXCLUSIVE); + this->scan_test(v, {}, cudf::make_sum_aggregation(), scan_type::INCLUSIVE); + this->scan_test(v, {}, cudf::make_sum_aggregation(), scan_type::EXCLUSIVE); // skipna = true (default) - this->scan_test(v, b, make_sum_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); - this->scan_test(v, b, make_sum_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, b, cudf::make_sum_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, b, cudf::make_sum_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); // skipna = false - this->scan_test(v, b, make_sum_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - this->scan_test(v, b, make_sum_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE); + this->scan_test(v, b, cudf::make_sum_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + this->scan_test(v, b, cudf::make_sum_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE); } TYPED_TEST(ScanTest, EmptyColumn) @@ -373,11 +284,11 @@ TYPED_TEST(ScanTest, EmptyColumn) auto const b = thrust::host_vector{}; // skipna = true (default) - this->scan_test(v, b, 
make_min_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); - this->scan_test(v, b, make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); // skipna = false - this->scan_test(v, b, make_min_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - this->scan_test(v, b, make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE); } TYPED_TEST(ScanTest, LeadingNulls) @@ -386,14 +297,14 @@ TYPED_TEST(ScanTest, LeadingNulls) auto const b = thrust::host_vector(std::vector{0, 1, 1}); // skipna = true (default) - this->scan_test(v, b, make_min_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); - this->scan_test(v, b, make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); // skipna = false - this->scan_test(v, b, make_min_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - this->scan_test(v, b, make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE); } -class ScanStringsTest : public ScanTest { +class ScanStringsTest : public ScanTest { }; TEST_F(ScanStringsTest, MoreStringsMinMax) @@ -406,289 +317,83 @@ TEST_F(ScanStringsTest, MoreStringsMinMax) }); auto validity = cudf::detail::make_counting_transform_iterator( 0, [](auto idx) -> bool { return (idx % 23) != 22; }); - strings_column_wrapper col(data_begin, data_begin + row_count, validity); + cudf::test::strings_column_wrapper col(data_begin, data_begin + row_count, validity); thrust::host_vector v(data_begin, data_begin + row_count); thrust::host_vector b(validity, validity + row_count); - this->scan_test(v, {}, make_min_aggregation(), scan_type::INCLUSIVE); - this->scan_test(v, b, make_min_aggregation(), scan_type::INCLUSIVE); - this->scan_test(v, b, make_min_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, {}, cudf::make_min_aggregation(), scan_type::INCLUSIVE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::INCLUSIVE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); - this->scan_test(v, {}, make_min_aggregation(), scan_type::EXCLUSIVE); - this->scan_test(v, b, make_min_aggregation(), scan_type::EXCLUSIVE); - this->scan_test(v, b, make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, {}, cudf::make_min_aggregation(), scan_type::EXCLUSIVE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::EXCLUSIVE); + this->scan_test(v, b, cudf::make_min_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); - this->scan_test(v, {}, make_max_aggregation(), scan_type::INCLUSIVE); - this->scan_test(v, b, make_max_aggregation(), scan_type::INCLUSIVE); - this->scan_test(v, b, make_max_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); + 
this->scan_test(v, {}, cudf::make_max_aggregation(), scan_type::INCLUSIVE); + this->scan_test(v, b, cudf::make_max_aggregation(), scan_type::INCLUSIVE); + this->scan_test(v, b, cudf::make_max_aggregation(), scan_type::INCLUSIVE, null_policy::EXCLUDE); - this->scan_test(v, {}, make_max_aggregation(), scan_type::EXCLUSIVE); - this->scan_test(v, b, make_max_aggregation(), scan_type::EXCLUSIVE); - this->scan_test(v, b, make_max_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); + this->scan_test(v, {}, cudf::make_max_aggregation(), scan_type::EXCLUSIVE); + this->scan_test(v, b, cudf::make_max_aggregation(), scan_type::EXCLUSIVE); + this->scan_test(v, b, cudf::make_max_aggregation(), scan_type::EXCLUSIVE, null_policy::EXCLUDE); } template -struct TypedRankScanTest : ScanTest { - inline void test_ungrouped_rank_scan(column_view const& input, - column_view const& expect_vals, - std::unique_ptr const& agg, - null_policy null_handling) - { - auto col_out = scan(input, agg, scan_type::INCLUSIVE, null_handling); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expect_vals, col_out->view()); - } +struct ScanChronoTest : public cudf::test::BaseFixture { }; -using RankTypes = - Concat; - -TYPED_TEST_CASE(TypedRankScanTest, RankTypes); - -TYPED_TEST(TypedRankScanTest, Rank) -{ - auto const v = [] { - if (std::is_signed::value) - return make_vector({-120, -120, -120, -16, -16, 5, 6, 6, 6, 6, 34, 113}); - return make_vector({5, 5, 5, 6, 6, 9, 11, 11, 11, 11, 14, 34}); - }(); - auto col = this->make_column(v); - - auto const expected_dense_vals = - fixed_width_column_wrapper{1, 1, 1, 2, 2, 3, 4, 4, 4, 4, 5, 6}; - auto const expected_rank_vals = - fixed_width_column_wrapper{1, 1, 1, 4, 4, 6, 7, 7, 7, 7, 11, 12}; - this->test_ungrouped_rank_scan( - *col, expected_dense_vals, make_dense_rank_aggregation(), null_policy::INCLUDE); - this->test_ungrouped_rank_scan( - *col, expected_rank_vals, make_rank_aggregation(), null_policy::INCLUDE); -} - -TYPED_TEST(TypedRankScanTest, RankWithNulls) -{ - auto const v = [] { - if (std::is_signed::value) - return make_vector({-120, -120, -120, -16, -16, 5, 6, 6, 6, 6, 34, 113}); - return make_vector({5, 5, 5, 6, 6, 9, 11, 11, 11, 11, 14, 34}); - }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0}); - auto col = this->make_column(v, b); - - auto const expected_dense_vals = - fixed_width_column_wrapper{1, 1, 1, 2, 3, 4, 5, 5, 6, 6, 7, 8}; - auto const expected_rank_vals = - fixed_width_column_wrapper{1, 1, 1, 4, 5, 6, 7, 7, 9, 9, 11, 12}; - this->test_ungrouped_rank_scan( - *col, expected_dense_vals, make_dense_rank_aggregation(), null_policy::INCLUDE); - this->test_ungrouped_rank_scan( - *col, expected_rank_vals, make_rank_aggregation(), null_policy::INCLUDE); -} - -TYPED_TEST(TypedRankScanTest, mixedStructs) -{ - auto const v = [] { - if (std::is_signed::value) - return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); - return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); - }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); - auto col = this->make_column(v, b); - auto strings = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - std::vector> vector_of_columns; - vector_of_columns.push_back(std::move(col)); - vector_of_columns.push_back(strings.release()); - auto struct_col = structs_column_wrapper{std::move(vector_of_columns)}.release(); - - auto expected_dense_vals = - fixed_width_column_wrapper{1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 
7, 8}; - auto expected_rank_vals = - fixed_width_column_wrapper{1, 1, 3, 3, 5, 6, 7, 7, 9, 10, 10, 12}; - - this->test_ungrouped_rank_scan( - *struct_col, expected_dense_vals, make_dense_rank_aggregation(), null_policy::INCLUDE); - this->test_ungrouped_rank_scan( - *struct_col, expected_rank_vals, make_rank_aggregation(), null_policy::INCLUDE); -} - -TYPED_TEST(TypedRankScanTest, nestedStructs) -{ - auto const v = [] { - if (std::is_signed::value) - return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); - return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); - }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); - auto col1 = this->make_column(v, b); - auto col2 = this->make_column(v, b); - auto col3 = this->make_column(v, b); - auto col4 = this->make_column(v, b); - auto strings1 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - auto strings2 = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - - std::vector> struct_columns; - struct_columns.push_back(std::move(col1)); - struct_columns.push_back(strings1.release()); - auto struct_col = structs_column_wrapper{std::move(struct_columns)}; - std::vector> nested_columns; - nested_columns.push_back(struct_col.release()); - nested_columns.push_back(std::move(col2)); - auto nested_col = structs_column_wrapper{std::move(nested_columns)}; - std::vector> flat_columns; - flat_columns.push_back(std::move(col3)); - flat_columns.push_back(strings2.release()); - flat_columns.push_back(std::move(col4)); - auto flat_col = structs_column_wrapper{std::move(flat_columns)}; - - auto dense_out = - scan(nested_col, make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto dense_expected = - scan(flat_col, make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_out = - scan(nested_col, make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_expected = - scan(flat_col, make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), dense_expected->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), rank_expected->view()); -} +TYPED_TEST_SUITE(ScanChronoTest, cudf::test::ChronoTypes); -TYPED_TEST(TypedRankScanTest, structsWithNullPushdown) +TYPED_TEST(ScanChronoTest, ChronoMinMax) { - auto const v = [] { - if (std::is_signed::value) - return make_vector({-1, -1, -4, -4, -4, 5, 7, 7, 7, 9, 9, 9}); - return make_vector({0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}); - }(); - auto const b = thrust::host_vector(std::vector{1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1}); - auto col = this->make_column(v, b); - auto strings = strings_column_wrapper{ - {"0a", "0a", "2a", "2a", "3b", "5", "6c", "6c", "6c", "9", "9", "10d"}, null_at(8)}; - std::vector> struct_columns; - struct_columns.push_back(std::move(col)); - struct_columns.push_back(strings.release()); - - auto struct_col = - cudf::make_structs_column(12, std::move(struct_columns), 0, rmm::device_buffer{}); - - struct_col->set_null_mask(create_null_mask(12, cudf::mask_state::ALL_NULL)); - auto expected_null_result = - fixed_width_column_wrapper{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - auto dense_null_out = - scan(*struct_col, make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_null_out = - scan(*struct_col, make_rank_aggregation(), scan_type::INCLUSIVE, 
null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_null_out->view(), expected_null_result); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_null_out->view(), expected_null_result); - - auto const struct_nulls = - thrust::host_vector(std::vector{1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}); - struct_col->set_null_mask( - cudf::test::detail::make_null_mask(struct_nulls.begin(), struct_nulls.end())); - auto expected_dense_vals = - fixed_width_column_wrapper{1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9}; - auto expected_rank_vals = - fixed_width_column_wrapper{1, 2, 2, 4, 5, 6, 7, 7, 9, 10, 10, 12}; - auto dense_out = - scan(*struct_col, make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_out = - scan(*struct_col, make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), expected_dense_vals); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), expected_rank_vals); + cudf::test::fixed_width_column_wrapper col({5, 4, 6, 0, 1, 6, 5, 3}, + {1, 1, 1, 0, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected_min({5, 4, 4, 0, 1, 1, 1, 1}, + {1, 1, 1, 0, 1, 1, 1, 1}); + + auto result = cudf::scan(col, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected_min); + + result = cudf::scan( + col, cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE, cudf::null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected_min); + + cudf::test::fixed_width_column_wrapper expected_max({5, 5, 6, 0, 6, 6, 6, 6}, + {1, 1, 1, 0, 1, 1, 1, 1}); + result = cudf::scan(col, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected_max); + + result = cudf::scan( + col, cudf::make_max_aggregation(), cudf::scan_type::INCLUSIVE, cudf::null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected_max); + + EXPECT_THROW(cudf::scan(col, cudf::make_max_aggregation(), cudf::scan_type::EXCLUSIVE), + cudf::logic_error); + EXPECT_THROW(cudf::scan(col, cudf::make_min_aggregation(), cudf::scan_type::EXCLUSIVE), + cudf::logic_error); } -/* List support dependent on https://github.com/rapidsai/cudf/issues/8683 template -struct ListRankScanTest : public BaseFixture { +struct ScanDurationTest : public cudf::test::BaseFixture { }; -using ListTestTypeSet = Concat; - -TYPED_TEST_CASE(ListRankScanTest, ListTestTypeSet); +TYPED_TEST_SUITE(ScanDurationTest, cudf::test::DurationTypes); -TYPED_TEST(ListRankScanTest, ListRank) +TYPED_TEST(ScanDurationTest, Sum) { - auto list_col = lists_column_wrapper{{0, 0}, - {0, 0}, - {7, 2}, - {7, 2}, - {7, 3}, - {5, 5}, - {4, 6}, - {4, 6}, - {4, 6}, - {9, 9}, - {9, 9}, - {9, 10}}; - fixed_width_column_wrapper element1{0, 0, 4, 4, 4, 5, 7, 7, 7, 9, 9, 9}; - fixed_width_column_wrapper element2{0, 0, 2, 2, 3, 5, 6, 6, 6, 9, 9, 10}; - auto struct_col = structs_column_wrapper{element1, element2}; - - auto dense_out = scan(list_col->view(), - make_dense_rank_aggregation(), - scan_type::INCLUSIVE, - null_policy::INCLUDE); - auto dense_expected = scan( - struct_col, make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_out = scan( - list_col->view(), make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_expected = scan( - struct_col, make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(dense_out->view(), 
dense_expected->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(rank_out->view(), rank_expected->view()); -} -*/ + cudf::test::fixed_width_column_wrapper col({5, 4, 6, 0, 1, 6, 5, 3}, + {1, 1, 1, 0, 1, 1, 1, 1}); + cudf::test::fixed_width_column_wrapper expected({5, 9, 15, 0, 16, 22, 27, 30}, + {1, 1, 1, 0, 1, 1, 1, 1}); -struct RankScanTest : public BaseFixture { -}; + auto result = cudf::scan(col, cudf::make_sum_aggregation(), cudf::scan_type::INCLUSIVE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); -TEST(RankScanTest, BoolRank) -{ - fixed_width_column_wrapper vals{0, 0, 0, 6, 6, 9, 11, 11, 11, 11, 14, 34}; - fixed_width_column_wrapper expected_dense_vals{1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2}; - fixed_width_column_wrapper expected_rank_vals{1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4}; - - auto dense_out = - scan(vals, make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto rank_out = scan(vals, make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense_vals, dense_out->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank_vals, rank_out->view()); -} + result = cudf::scan( + col, cudf::make_sum_aggregation(), cudf::scan_type::INCLUSIVE, cudf::null_policy::EXCLUDE); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); -TEST(RankScanTest, BoolRankWithNull) -{ - fixed_width_column_wrapper vals{{0, 0, 0, 6, 6, 9, 11, 11, 11, 11, 14, 34}, - {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}}; - table_view order_table{std::vector{vals}}; - fixed_width_column_wrapper expected_dense_vals{1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3}; - fixed_width_column_wrapper expected_rank_vals{1, 1, 1, 4, 4, 4, 4, 4, 9, 9, 9, 9}; - - auto nullable_dense_out = - scan(vals, make_dense_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - auto nullable_rank_out = - scan(vals, make_rank_aggregation(), scan_type::INCLUSIVE, null_policy::INCLUDE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_dense_vals, nullable_dense_out->view()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_rank_vals, nullable_rank_out->view()); + EXPECT_THROW(cudf::scan(col, cudf::make_sum_aggregation(), cudf::scan_type::EXCLUSIVE), + cudf::logic_error); } - -TEST(RankScanTest, ExclusiveScan) -{ - fixed_width_column_wrapper vals{3, 4, 5}; - fixed_width_column_wrapper order_col{3, 3, 1}; - table_view order_table{std::vector{order_col}}; - - CUDF_EXPECT_THROW_MESSAGE( - scan(vals, make_dense_rank_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE), - "Unsupported dense rank aggregation operator for exclusive scan"); - CUDF_EXPECT_THROW_MESSAGE( - scan(vals, make_rank_aggregation(), scan_type::EXCLUSIVE, null_policy::INCLUDE), - "Unsupported rank aggregation operator for exclusive scan"); -} - -} // namespace test -} // namespace cudf diff --git a/cpp/tests/reductions/scan_tests.hpp b/cpp/tests/reductions/scan_tests.hpp new file mode 100644 index 00000000000..b2c53cf5915 --- /dev/null +++ b/cpp/tests/reductions/scan_tests.hpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +template +struct TypeParam_to_host_type { + using type = T; +}; + +template <> +struct TypeParam_to_host_type { + using type = std::string; +}; + +template <> +struct TypeParam_to_host_type { + using type = numeric::decimal32::rep; +}; + +template <> +struct TypeParam_to_host_type { + using type = numeric::decimal64::rep; +}; + +template +typename std::enable_if, + thrust::host_vector>::type +make_vector(std::initializer_list const& init) +{ + return cudf::test::make_type_param_vector(init); +} + +template +typename std::enable_if(), + thrust::host_vector>::type +make_vector(std::initializer_list const& init) +{ + return cudf::test::make_type_param_vector(init); +} + +template +typename std::enable_if || + cudf::is_fixed_point()), + thrust::host_vector>::type +make_vector(std::initializer_list const& init) +{ + return cudf::test::make_type_param_vector(init); +} + +// This is the base test feature +template +struct BaseScanTest : public cudf::test::BaseFixture { + using HostType = typename TypeParam_to_host_type::type; + + std::unique_ptr make_column(cudf::host_span v, + cudf::host_span b = {}, + numeric::scale_type scale = numeric::scale_type{0}) + { + if constexpr (std::is_same_v) { + auto col = (b.size() > 0) ? cudf::test::strings_column_wrapper(v.begin(), v.end(), b.begin()) + : cudf::test::strings_column_wrapper(v.begin(), v.end()); + return col.release(); + } else if constexpr (cudf::is_fixed_point()) { + auto col = (b.size() > 0) ? cudf::test::fixed_point_column_wrapper( + v.begin(), v.end(), b.begin(), scale) + : cudf::test::fixed_point_column_wrapper( + v.begin(), v.end(), scale); + return col.release(); + } else { + auto col = (b.size() > 0) + ? 
cudf::test::fixed_width_column_wrapper(v.begin(), v.end(), b.begin()) + : cudf::test::fixed_width_column_wrapper(v.begin(), v.end()); + return col.release(); + } + } +}; diff --git a/cpp/tests/replace/clamp_test.cpp b/cpp/tests/replace/clamp_test.cpp index ecc36b3af20..e315bdd9b16 100644 --- a/cpp/tests/replace/clamp_test.cpp +++ b/cpp/tests/replace/clamp_test.cpp @@ -204,7 +204,7 @@ struct ClampTestNumeric : public cudf::test::BaseFixture { }; using Types = cudf::test::FixedWidthTypesWithoutFixedPoint; -TYPED_TEST_CASE(ClampTestNumeric, Types); +TYPED_TEST_SUITE(ClampTestNumeric, Types); TYPED_TEST(ClampTestNumeric, WithNoNull) { @@ -292,7 +292,7 @@ template struct ClampFloatTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ClampFloatTest, cudf::test::FloatingPointTypes); +TYPED_TEST_SUITE(ClampFloatTest, cudf::test::FloatingPointTypes); TYPED_TEST(ClampFloatTest, WithNANandNoNull) { @@ -591,7 +591,7 @@ template struct FixedPointTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTest, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTest, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTest, ZeroScale) { diff --git a/cpp/tests/replace/replace_nans_tests.cpp b/cpp/tests/replace/replace_nans_tests.cpp index 3f157e9122e..309bb3769ff 100644 --- a/cpp/tests/replace/replace_nans_tests.cpp +++ b/cpp/tests/replace/replace_nans_tests.cpp @@ -73,7 +73,7 @@ struct ReplaceNaNsTest : public BaseFixture { using test_types = Types; -TYPED_TEST_CASE(ReplaceNaNsTest, test_types); +TYPED_TEST_SUITE(ReplaceNaNsTest, test_types); template void ReplaceNaNsColumn(fixed_width_column_wrapper input, diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp index 02ebf5ac977..effa026867e 100644 --- a/cpp/tests/replace/replace_nulls_tests.cpp +++ b/cpp/tests/replace/replace_nulls_tests.cpp @@ -248,7 +248,7 @@ struct ReplaceNullsTest : public cudf::test::BaseFixture { using test_types = cudf::test::NumericTypes; -TYPED_TEST_CASE(ReplaceNullsTest, test_types); +TYPED_TEST_SUITE(ReplaceNullsTest, test_types); template void ReplaceNullsColumn(cudf::test::fixed_width_column_wrapper input, @@ -378,7 +378,7 @@ template struct ReplaceNullsPolicyTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ReplaceNullsPolicyTest, test_types); +TYPED_TEST_SUITE(ReplaceNullsPolicyTest, test_types); template void TestReplaceNullsWithPolicy(cudf::test::fixed_width_column_wrapper input, @@ -491,7 +491,7 @@ template struct ReplaceNullsFixedPointTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ReplaceNullsFixedPointTest, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(ReplaceNullsFixedPointTest, cudf::test::FixedPointTypes); TYPED_TEST(ReplaceNullsFixedPointTest, ReplaceColumn) { @@ -578,7 +578,7 @@ template struct ReplaceNullsPolicyFixedPointTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ReplaceNullsPolicyFixedPointTest, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(ReplaceNullsPolicyFixedPointTest, cudf::test::FixedPointTypes); TYPED_TEST(ReplaceNullsPolicyFixedPointTest, PrecedingFill) { diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index 58ef08f6052..cfafbf26dac 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -391,7 +391,7 @@ void test_replace(cudf::host_span input_column, using Types = cudf::test::NumericTypes; -TYPED_TEST_CASE(ReplaceTest, Types); +TYPED_TEST_SUITE(ReplaceTest, Types); // Simple test, replacing all even 
replacement_values_column TYPED_TEST(ReplaceTest, ReplaceEvenPosition) @@ -544,7 +544,7 @@ struct FixedPointTestBothReps : public cudf::test::BaseFixture { template using wrapper = cudf::test::fixed_width_column_wrapper; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, FixedPointReplace) { diff --git a/cpp/tests/reshape/tile_tests.cpp b/cpp/tests/reshape/tile_tests.cpp index 073172bf7ac..48eb8919d23 100644 --- a/cpp/tests/reshape/tile_tests.cpp +++ b/cpp/tests/reshape/tile_tests.cpp @@ -30,7 +30,7 @@ template struct TileTest : public BaseFixture { }; -TYPED_TEST_CASE(TileTest, cudf::test::AllTypes); +TYPED_TEST_SUITE(TileTest, cudf::test::AllTypes); TYPED_TEST(TileTest, NoColumns) { diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index 5631c910753..ce778ec3bf2 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -48,7 +48,7 @@ using TypesForTest = cudf::test::Concat; -TYPED_TEST_CASE(TypedCollectListTest, TypesForTest); +TYPED_TEST_SUITE(TypedCollectListTest, TypesForTest); TYPED_TEST(TypedCollectListTest, BasicRollingWindow) { @@ -1296,7 +1296,7 @@ using TypesForSetTest = cudf::test::Concat; -TYPED_TEST_CASE(TypedCollectSetTest, TypesForSetTest); +TYPED_TEST_SUITE(TypedCollectSetTest, TypesForSetTest); TYPED_TEST(TypedCollectSetTest, BasicRollingWindow) { diff --git a/cpp/tests/rolling/empty_input_test.cpp b/cpp/tests/rolling/empty_input_test.cpp index 3296f9d32f9..0f05c675aae 100644 --- a/cpp/tests/rolling/empty_input_test.cpp +++ b/cpp/tests/rolling/empty_input_test.cpp @@ -78,7 +78,7 @@ template struct TypedRollingEmptyInputTest : RollingEmptyInputTest { }; -TYPED_TEST_CASE(TypedRollingEmptyInputTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(TypedRollingEmptyInputTest, cudf::test::FixedWidthTypes); using cudf::rolling_aggregation; using agg_vector_t = std::vector>; diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp index 0bd88c78200..545f395fcdb 100644 --- a/cpp/tests/rolling/grouped_rolling_test.cpp +++ b/cpp/tests/rolling/grouped_rolling_test.cpp @@ -532,7 +532,7 @@ TEST_F(GroupedRollingErrorTest, SumTimestampNotSupported) cudf::logic_error); } -TYPED_TEST_CASE(GroupedRollingTest, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(GroupedRollingTest, cudf::test::FixedWidthTypesWithoutFixedPoint); TYPED_TEST(GroupedRollingTest, SimplePartitionedStaticWindowsWithGroupKeys) { @@ -1110,7 +1110,7 @@ class GroupedTimeRangeRollingTest : public cudf::test::BaseFixture { } }; -TYPED_TEST_CASE(GroupedTimeRangeRollingTest, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(GroupedTimeRangeRollingTest, cudf::test::FixedWidthTypesWithoutFixedPoint); TYPED_TEST(GroupedTimeRangeRollingTest, SimplePartitionedStaticWindowsWithGroupKeysAndTimeRangesAscending) @@ -1227,7 +1227,7 @@ struct TypedNullTimestampTestForRangeQueries : public cudf::test::BaseFixture { struct NullTimestampTestForRangeQueries : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(TypedNullTimestampTestForRangeQueries, cudf::test::IntegralTypes); +TYPED_TEST_SUITE(TypedNullTimestampTestForRangeQueries, cudf::test::IntegralTypes); TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountSingleGroupTimestampASCNullsFirst) { @@ -1541,7 +1541,7 @@ using FixedWidthTypes = cudf::test::Concat; -TYPED_TEST_CASE(TypedUnboundedWindowTest, 
FixedWidthTypes); +TYPED_TEST_SUITE(TypedUnboundedWindowTest, FixedWidthTypes); TYPED_TEST(TypedUnboundedWindowTest, UnboundedPrecedingWindowSingleGroupTimestampASCNullsFirst) { diff --git a/cpp/tests/rolling/lead_lag_test.cpp b/cpp/tests/rolling/lead_lag_test.cpp index 4af7bf69212..fa1d13bcb86 100644 --- a/cpp/tests/rolling/lead_lag_test.cpp +++ b/cpp/tests/rolling/lead_lag_test.cpp @@ -59,7 +59,7 @@ using TypesForTest = cudf::test::Concat; -TYPED_TEST_CASE(TypedLeadLagWindowTest, TypesForTest); +TYPED_TEST_SUITE(TypedLeadLagWindowTest, TypesForTest); TYPED_TEST(TypedLeadLagWindowTest, LeadLagBasics) { @@ -536,7 +536,7 @@ template struct TypedNestedLeadLagWindowTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(TypedNestedLeadLagWindowTest, TypesForTest); +TYPED_TEST_SUITE(TypedNestedLeadLagWindowTest, TypesForTest); TYPED_TEST(TypedNestedLeadLagWindowTest, NumericListsWithNullsAllOver) { diff --git a/cpp/tests/rolling/range_rolling_window_test.cpp b/cpp/tests/rolling/range_rolling_window_test.cpp index 03bb7a80a37..8d92bf56180 100644 --- a/cpp/tests/rolling/range_rolling_window_test.cpp +++ b/cpp/tests/rolling/range_rolling_window_test.cpp @@ -101,7 +101,7 @@ template struct TypedTimeRangeRollingTest : RangeRollingTest { }; -TYPED_TEST_CASE(TypedTimeRangeRollingTest, cudf::test::TimestampTypes); +TYPED_TEST_SUITE(TypedTimeRangeRollingTest, cudf::test::TimestampTypes); template void verify_results_for_ascending(WindowExecT exec) @@ -265,7 +265,7 @@ template struct TypedIntegralRangeRollingTest : RangeRollingTest { }; -TYPED_TEST_CASE(TypedIntegralRangeRollingTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST_SUITE(TypedIntegralRangeRollingTest, cudf::test::IntegralTypesNotBool); TYPED_TEST(TypedIntegralRangeRollingTest, OrderByASC) { @@ -321,7 +321,7 @@ struct TypedRangeRollingNullsTest : public RangeRollingTest { using TypesUnderTest = IntegralTypesNotBool; -TYPED_TEST_CASE(TypedRangeRollingNullsTest, TypesUnderTest); +TYPED_TEST_SUITE(TypedRangeRollingNullsTest, TypesUnderTest); template auto do_count_over_window( diff --git a/cpp/tests/rolling/range_window_bounds_test.cpp b/cpp/tests/rolling/range_window_bounds_test.cpp index 00f7637ac29..451f8b054ce 100644 --- a/cpp/tests/rolling/range_window_bounds_test.cpp +++ b/cpp/tests/rolling/range_window_bounds_test.cpp @@ -40,7 +40,7 @@ template struct TimestampRangeWindowBoundsTest : RangeWindowBoundsTest { }; -TYPED_TEST_CASE(TimestampRangeWindowBoundsTest, cudf::test::TimestampTypes); +TYPED_TEST_SUITE(TimestampRangeWindowBoundsTest, cudf::test::TimestampTypes); TEST_F(RangeWindowBoundsTest, TestBasicTimestampRangeTypeMapping) { @@ -103,7 +103,7 @@ struct NumericRangeWindowBoundsTest : RangeWindowBoundsTest { using TypesForTest = cudf::test::IntegralTypesNotBool; -TYPED_TEST_CASE(NumericRangeWindowBoundsTest, TypesForTest); +TYPED_TEST_SUITE(NumericRangeWindowBoundsTest, TypesForTest); TYPED_TEST(NumericRangeWindowBoundsTest, BasicNumericRangeTypeMapping) { diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index a83e5886df5..7d1645faba9 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -406,7 +406,7 @@ template class RollingVarStdTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(RollingVarStdTest, cudf::test::FixedWidthTypesWithoutChrono); +TYPED_TEST_SUITE(RollingVarStdTest, cudf::test::FixedWidthTypesWithoutChrono); class RollingtVarStdTestUntyped : public cudf::test::BaseFixture { }; @@ -599,7 +599,7 @@ TEST_F(RollingErrorTest, 
MeanTimestampNotSupported) cudf::logic_error); } -TYPED_TEST_CASE(RollingTest, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(RollingTest, cudf::test::FixedWidthTypesWithoutFixedPoint); // simple example from Pandas docs TYPED_TEST(RollingTest, SimpleStatic) @@ -1142,7 +1142,7 @@ template struct FixedPointTests : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTests, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTests, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTests, MinMaxCountLagLead) { diff --git a/cpp/tests/round/round_tests.cpp b/cpp/tests/round/round_tests.cpp index 825703274e2..1a9302a3e7e 100644 --- a/cpp/tests/round/round_tests.cpp +++ b/cpp/tests/round/round_tests.cpp @@ -40,9 +40,9 @@ struct RoundTestsFloatingPointTypes : public cudf::test::BaseFixture { using IntegerTypes = cudf::test::Types; -TYPED_TEST_CASE(RoundTestsIntegerTypes, IntegerTypes); -TYPED_TEST_CASE(RoundTestsFixedPointTypes, cudf::test::FixedPointTypes); -TYPED_TEST_CASE(RoundTestsFloatingPointTypes, cudf::test::FloatingPointTypes); +TYPED_TEST_SUITE(RoundTestsIntegerTypes, IntegerTypes); +TYPED_TEST_SUITE(RoundTestsFixedPointTypes, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(RoundTestsFloatingPointTypes, cudf::test::FloatingPointTypes); TYPED_TEST(RoundTestsFixedPointTypes, SimpleFixedPointTestHalfUpZero) { diff --git a/cpp/tests/scalar/factories_test.cpp b/cpp/tests/scalar/factories_test.cpp index e2f2c26a16e..3e89e435bc0 100644 --- a/cpp/tests/scalar/factories_test.cpp +++ b/cpp/tests/scalar/factories_test.cpp @@ -35,7 +35,7 @@ struct NumericScalarFactory : public ScalarFactoryTest { static constexpr auto factory = cudf::make_numeric_scalar; }; -TYPED_TEST_CASE(NumericScalarFactory, cudf::test::NumericTypes); +TYPED_TEST_SUITE(NumericScalarFactory, cudf::test::NumericTypes); TYPED_TEST(NumericScalarFactory, FactoryDefault) { @@ -65,7 +65,7 @@ struct TimestampScalarFactory : public ScalarFactoryTest { static constexpr auto factory = cudf::make_timestamp_scalar; }; -TYPED_TEST_CASE(TimestampScalarFactory, cudf::test::TimestampTypes); +TYPED_TEST_SUITE(TimestampScalarFactory, cudf::test::TimestampTypes); TYPED_TEST(TimestampScalarFactory, FactoryDefault) { @@ -96,7 +96,7 @@ struct DefaultScalarFactory : public ScalarFactoryTest { }; using MixedTypes = cudf::test::Concat; -TYPED_TEST_CASE(DefaultScalarFactory, MixedTypes); +TYPED_TEST_SUITE(DefaultScalarFactory, MixedTypes); TYPED_TEST(DefaultScalarFactory, FactoryDefault) { @@ -123,7 +123,7 @@ template struct FixedWidthScalarFactory : public ScalarFactoryTest { }; -TYPED_TEST_CASE(FixedWidthScalarFactory, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(FixedWidthScalarFactory, cudf::test::FixedWidthTypesWithoutFixedPoint); TYPED_TEST(FixedWidthScalarFactory, ValueProvided) { @@ -144,7 +144,7 @@ template struct FixedPointScalarFactory : public ScalarFactoryTest { }; -TYPED_TEST_CASE(FixedPointScalarFactory, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointScalarFactory, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointScalarFactory, ValueProvided) { diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu index 19d5372d93a..984e591d19f 100644 --- a/cpp/tests/scalar/scalar_device_view_test.cu +++ b/cpp/tests/scalar/scalar_device_view_test.cu @@ -33,7 +33,7 @@ template struct TypedScalarDeviceViewTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(TypedScalarDeviceViewTest, cudf::test::FixedWidthTypesWithoutFixedPoint); 
+TYPED_TEST_SUITE(TypedScalarDeviceViewTest, cudf::test::FixedWidthTypesWithoutFixedPoint); template __global__ void test_set_value(ScalarDeviceViewType s, ScalarDeviceViewType s1) diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp index b54594fd1c4..45646259cce 100644 --- a/cpp/tests/scalar/scalar_test.cpp +++ b/cpp/tests/scalar/scalar_test.cpp @@ -29,8 +29,8 @@ template struct TypedScalarTestWithoutFixedPoint : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(TypedScalarTest, cudf::test::FixedWidthTypes); -TYPED_TEST_CASE(TypedScalarTestWithoutFixedPoint, cudf::test::FixedWidthTypesWithoutFixedPoint); +TYPED_TEST_SUITE(TypedScalarTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(TypedScalarTestWithoutFixedPoint, cudf::test::FixedWidthTypesWithoutFixedPoint); TYPED_TEST(TypedScalarTest, DefaultValidity) { diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index 8738a9195a5..db2ecb89d6a 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -44,7 +44,7 @@ using TestTypes = cudf::test::Concat; -TYPED_TEST_CASE(TypedStructSearchTest, TestTypes); +TYPED_TEST_SUITE(TypedStructSearchTest, TestTypes); namespace { auto search_bounds(cudf::column_view const& t_col_view, diff --git a/cpp/tests/search/search_test.cpp b/cpp/tests/search/search_test.cpp index 38fc5abb250..79d992005d8 100644 --- a/cpp/tests/search/search_test.cpp +++ b/cpp/tests/search/search_test.cpp @@ -1820,7 +1820,7 @@ template struct FixedPointTestBothReps : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, FixedPointLowerBound) { diff --git a/cpp/tests/sort/is_sorted_tests.cpp b/cpp/tests/sort/is_sorted_tests.cpp index 58ebc922f2a..f2d82d6ecb4 100644 --- a/cpp/tests/sort/is_sorted_tests.cpp +++ b/cpp/tests/sort/is_sorted_tests.cpp @@ -239,7 +239,7 @@ struct IsSortedTest : public BaseFixture { }; using SupportedTypes = Concat>; -TYPED_TEST_CASE(IsSortedTest, SupportedTypes); +TYPED_TEST_SUITE(IsSortedTest, SupportedTypes); TYPED_TEST(IsSortedTest, NoColumns) { @@ -449,6 +449,6 @@ template struct IsSortedFixedWidthOnly : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(IsSortedFixedWidthOnly, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(IsSortedFixedWidthOnly, cudf::test::FixedWidthTypes); CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp index 7ae4ad5080e..94e389fc7ce 100644 --- a/cpp/tests/sort/rank_test.cpp +++ b/cpp/tests/sort/rank_test.cpp @@ -103,7 +103,7 @@ struct Rank : public BaseFixture { } }; -TYPED_TEST_CASE(Rank, NumericTypes); +TYPED_TEST_SUITE(Rank, NumericTypes); // fixed_width_column_wrapper col1{{ 5, 4, 3, 5, 8, 5}}; // 3, 2, 1, 4, 6, 5 diff --git a/cpp/tests/sort/segmented_sort_tests.cpp b/cpp/tests/sort/segmented_sort_tests.cpp index 3b70f9f9d1d..fb07bfde795 100644 --- a/cpp/tests/sort/segmented_sort_tests.cpp +++ b/cpp/tests/sort/segmented_sort_tests.cpp @@ -38,7 +38,7 @@ template struct SegmentedSort : public BaseFixture { }; -TYPED_TEST_CASE(SegmentedSort, NumericTypes); +TYPED_TEST_SUITE(SegmentedSort, NumericTypes); using SegmentedSortInt = SegmentedSort; /* Summary of test cases. 
diff --git a/cpp/tests/sort/sort_test.cpp b/cpp/tests/sort/sort_test.cpp index 6e668068f94..54cd97301a8 100644 --- a/cpp/tests/sort/sort_test.cpp +++ b/cpp/tests/sort/sort_test.cpp @@ -59,7 +59,7 @@ template struct Sort : public BaseFixture { }; -TYPED_TEST_CASE(Sort, TestTypes); +TYPED_TEST_SUITE(Sort, TestTypes); TYPED_TEST(Sort, WithNullMax) { @@ -678,7 +678,7 @@ struct FixedPointTestBothReps : public cudf::test::BaseFixture { template using wrapper = cudf::test::fixed_width_column_wrapper; -TYPED_TEST_CASE(FixedPointTestBothReps, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTestBothReps, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTestBothReps, FixedPointSortedOrderGather) { diff --git a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp index 373cd50fb1f..916d2a33b97 100644 --- a/cpp/tests/stream_compaction/drop_duplicates_tests.cpp +++ b/cpp/tests/stream_compaction/drop_duplicates_tests.cpp @@ -35,7 +35,7 @@ template struct DistinctCountCommon : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(DistinctCountCommon, cudf::test::NumericTypes); +TYPED_TEST_SUITE(DistinctCountCommon, cudf::test::NumericTypes); TYPED_TEST(DistinctCountCommon, NoNull) { diff --git a/cpp/tests/stream_compaction/drop_nulls_tests.cpp b/cpp/tests/stream_compaction/drop_nulls_tests.cpp index 74c61e720ed..5fee95444f3 100644 --- a/cpp/tests/stream_compaction/drop_nulls_tests.cpp +++ b/cpp/tests/stream_compaction/drop_nulls_tests.cpp @@ -209,7 +209,7 @@ template struct DropNullsTestAll : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(DropNullsTestAll, cudf::test::NumericTypes); +TYPED_TEST_SUITE(DropNullsTestAll, cudf::test::NumericTypes); TYPED_TEST(DropNullsTestAll, AllNull) { diff --git a/cpp/tests/strings/array_tests.cu b/cpp/tests/strings/array_tests.cpp similarity index 61% rename from cpp/tests/strings/array_tests.cu rename to cpp/tests/strings/array_tests.cpp index a4d8ecb2bec..2a13abfacfb 100644 --- a/cpp/tests/strings/array_tests.cu +++ b/cpp/tests/strings/array_tests.cpp @@ -15,25 +15,17 @@ */ #include + #include #include #include -#include -#include #include -#include -#include #include #include #include -#include -#include #include #include -#include - -#include #include @@ -120,7 +112,7 @@ TEST_P(SliceParmsTest, SliceAllEmpty) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -INSTANTIATE_TEST_CASE_P(SliceParms, +INSTANTIATE_TEST_CASE_P(StringsColumnTest, SliceParmsTest, testing::ValuesIn(std::array{5, 6, 7})); @@ -161,98 +153,78 @@ TEST_F(StringsColumnTest, GatherZeroSizeStringsColumn) cudf::test::expect_strings_empty(results.front()->view()); } -struct column_to_string_view_vector { - cudf::column_device_view const d_strings; - __device__ cudf::string_view operator()(cudf::size_type idx) const - { - cudf::string_view d_str{nullptr, 0}; - if (d_strings.is_valid(idx)) d_str = d_strings.element(idx); - return d_str; - } -}; - TEST_F(StringsColumnTest, GatherTooBig) { - cudf::test::strings_column_wrapper strings({"0123456789012345678901234567890123456789"}); + std::vector h_chars(3000000); + cudf::test::fixed_width_column_wrapper chars(h_chars.begin(), h_chars.end()); + cudf::test::fixed_width_column_wrapper offsets({0, 3000000}); + auto input = cudf::column_view( + cudf::data_type{cudf::type_id::STRING}, 1, nullptr, nullptr, 0, 0, {offsets, chars}); auto map = thrust::constant_iterator(0); - cudf::test::fixed_width_column_wrapper gather_map( - map, map + std::numeric_limits::max() / 20); - 
EXPECT_THROW(cudf::gather(cudf::table_view{{strings}}, gather_map), cudf::logic_error); + cudf::test::fixed_width_column_wrapper gather_map(map, map + 1000); + EXPECT_THROW(cudf::gather(cudf::table_view{{input}}, gather_map), cudf::logic_error); } TEST_F(StringsColumnTest, Scatter) { - std::vector h_strings1{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; - cudf::test::strings_column_wrapper strings1( - h_strings1.begin(), - h_strings1.end(), - thrust::make_transform_iterator(h_strings1.begin(), [](auto str) { return str != nullptr; })); - auto target = cudf::strings_column_view(strings1); - std::vector h_strings2{"1", "22"}; - cudf::test::strings_column_wrapper strings2( - h_strings2.begin(), - h_strings2.end(), - thrust::make_transform_iterator(h_strings2.begin(), [](auto str) { return str != nullptr; })); - auto source = cudf::strings_column_view(strings2); - - std::vector h_scatter_map({4, 1}); - auto scatter_map = cudf::detail::make_device_uvector_sync(h_scatter_map); - - auto source_column = cudf::column_device_view::create(source.parent()); - auto begin = - cudf::detail::make_counting_transform_iterator(0, column_to_string_view_vector{*source_column}); + cudf::test::strings_column_wrapper target({"eee", "bb", "", "", "aa", "bbb", "ééé"}, + {1, 1, 0, 1, 1, 1, 1}); + cudf::test::strings_column_wrapper source({"1", "22"}); - auto results = - cudf::strings::detail::scatter(begin, begin + source.size(), scatter_map.begin(), target); + cudf::test::fixed_width_column_wrapper scatter_map({4, 1}); - std::vector h_expected{"eee", "22", nullptr, "", "1", "bbb", "ééé"}; - cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + auto results = cudf::scatter(cudf::table_view({source}), scatter_map, cudf::table_view({target})); + + cudf::test::strings_column_wrapper expected({"eee", "22", "", "", "1", "bbb", "ééé"}, + {1, 1, 0, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); } TEST_F(StringsColumnTest, ScatterScalar) { - std::vector h_strings1{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; - cudf::test::strings_column_wrapper strings1( - h_strings1.begin(), - h_strings1.end(), - thrust::make_transform_iterator(h_strings1.begin(), [](auto str) { return str != nullptr; })); - auto target = cudf::strings_column_view(strings1); + cudf::test::strings_column_wrapper target({"eee", "bb", "", "", "aa", "bbb", "ééé"}, + {1, 1, 0, 1, 1, 1, 1}); - std::vector h_scatter_map({0, 5}); - auto scatter_map = cudf::detail::make_device_uvector_sync(h_scatter_map); + cudf::test::fixed_width_column_wrapper scatter_map({0, 5}); cudf::string_scalar scalar("__"); - auto begin = thrust::make_constant_iterator(cudf::string_view(scalar.data(), scalar.size())); - - auto results = - cudf::strings::detail::scatter(begin, begin + scatter_map.size(), scatter_map.begin(), target); + auto source = std::vector>({scalar}); + auto results = cudf::scatter(source, scatter_map, cudf::table_view({target})); - std::vector h_expected{"__", "bb", nullptr, "", "aa", "__", "ééé"}; - cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + cudf::test::strings_column_wrapper expected({"__", "bb", "", "", "aa", "__", "ééé"}, + {1, 1, 0, 1, 1, 1, 1}); 
+ CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); } TEST_F(StringsColumnTest, ScatterZeroSizeStringsColumn) { - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto source = cudf::strings_column_view(zero_size_strings_column); - cudf::column_view values(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto target = cudf::strings_column_view(values); + cudf::column_view source(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + cudf::column_view target(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + cudf::column_view scatter_map(cudf::data_type{cudf::type_id::INT8}, 0, nullptr, nullptr, 0); + + auto results = cudf::scatter(cudf::table_view({source}), scatter_map, cudf::table_view({target})); + cudf::test::expect_strings_empty(results->view().column(0)); - rmm::device_uvector scatter_map(0, rmm::cuda_stream_default); cudf::string_scalar scalar(""); - auto begin = thrust::make_constant_iterator(cudf::string_view(scalar.data(), scalar.size())); + auto scalar_source = std::vector>({scalar}); + results = cudf::scatter(scalar_source, scatter_map, cudf::table_view({target})); + cudf::test::expect_strings_empty(results->view().column(0)); +} - auto results = cudf::strings::detail::scatter(begin, begin, scatter_map.begin(), target); - cudf::test::expect_strings_empty(results->view()); +TEST_F(StringsColumnTest, OffsetsBeginEnd) +{ + cudf::test::strings_column_wrapper input({"eee", "bb", "", "", "aa", "bbb", "ééé"}, + {1, 1, 0, 1, 1, 1, 1}); + + cudf::test::fixed_width_column_wrapper expected({0, 5}); + auto scv = cudf::strings_column_view(input); + EXPECT_EQ(std::distance(scv.offsets_begin(), scv.offsets_end()), + static_cast(scv.size() + 1)); + + scv = cudf::strings_column_view(cudf::slice(input, {1, 5}).front()); + EXPECT_EQ(std::distance(scv.offsets_begin(), scv.offsets_end()), + static_cast(scv.size() + 1)); + EXPECT_EQ(std::distance(scv.chars_begin(), scv.chars_end()), 16L); } CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/strings/case_tests.cpp b/cpp/tests/strings/case_tests.cpp index da55e967266..a9d4c9c76b5 100644 --- a/cpp/tests/strings/case_tests.cpp +++ b/cpp/tests/strings/case_tests.cpp @@ -149,6 +149,32 @@ TEST_F(StringsCaseTest, Title) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } +TEST_F(StringsCaseTest, IsTitle) +{ + cudf::test::strings_column_wrapper input({"Sⱥⱥnich", + "Examples Abc", + "Thesé Strings", + "", + "Are The", + "Tést strings", + "", + "N2Vidia Corp", + "SNAKE", + "!Abc", + " Eagle", + "A Test", + "12345", + "Alpha Not Upper Or Lower: ƻC", + "one More"}, + {1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto results = cudf::strings::is_title(cudf::strings_column_view(input)); + + cudf::test::fixed_width_column_wrapper expected( + {1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0}, {1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsCaseTest, MultiCharUpper) { cudf::test::strings_column_wrapper strings{"\u1f52 \u1f83", "\u1e98 \ufb05", "\u0149"}; diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp index 17e08bd21c5..ff9f79ea87f 100644 --- a/cpp/tests/strings/chars_types_tests.cpp +++ b/cpp/tests/strings/chars_types_tests.cpp @@ -28,12 +28,11 @@ struct StringsCharsTest : public cudf::test::BaseFixture { }; -class StringsCharsTestTypes - : public StringsCharsTest, - public testing::WithParamInterface { +class 
CharsTypes : public StringsCharsTest, + public testing::WithParamInterface { }; -TEST_P(StringsCharsTestTypes, AllTypes) +TEST_P(CharsTypes, AllTypes) { std::vector h_strings{"Héllo", "thesé", @@ -84,8 +83,8 @@ TEST_P(StringsCharsTestTypes, AllTypes) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -INSTANTIATE_TEST_CASE_P(StringsCharsTestAllTypes, - StringsCharsTestTypes, +INSTANTIATE_TEST_CASE_P(StringsCharsTest, + CharsTypes, testing::ValuesIn(std::array{ cudf::strings::string_character_types::DECIMAL, cudf::strings::string_character_types::NUMERIC, diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index ddd6fc9e1dc..3c11444e4b5 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -21,6 +21,7 @@ #include #include +#include #include struct StringsContainsTests : public cudf::test::BaseFixture { @@ -236,6 +237,30 @@ TEST_F(StringsContainsTests, MatchesIPV4Test) } } +TEST_F(StringsContainsTests, EmbeddedNullCharacter) +{ + std::vector data(10); + std::generate(data.begin(), data.end(), [n = 0]() mutable { + char first = static_cast('A' + n++); + char raw_data[] = {first, '\0', 'B'}; + return std::string{raw_data, 3}; + }); + cudf::test::strings_column_wrapper input(data.begin(), data.end()); + auto strings_view = cudf::strings_column_view(input); + + auto results = cudf::strings::contains_re(strings_view, "A"); + auto expected = cudf::test::fixed_width_column_wrapper({1, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + results = cudf::strings::contains_re(strings_view, "B"); + expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + results = cudf::strings::contains_re(strings_view, "J\\0B"); + expected = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} + TEST_F(StringsContainsTests, CountTest) { std::vector h_strings{ @@ -275,6 +300,66 @@ TEST_F(StringsContainsTests, CountTest) } } +TEST_F(StringsContainsTests, MultiLine) +{ + auto input = cudf::test::strings_column_wrapper({"abc\nfff\nabc", "fff\nabc\nlll", "abc", ""}); + auto view = cudf::strings_column_view(input); + + auto results = cudf::strings::contains_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); + auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + results = cudf::strings::contains_re(view, "^abc$"); + expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + + results = cudf::strings::matches_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); + auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + results = cudf::strings::matches_re(view, "^abc$"); + expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + + results = cudf::strings::count_re(view, "^abc$", cudf::strings::regex_flags::MULTILINE); + auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + results = cudf::strings::count_re(view, "^abc$"); + expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 
1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); +} + +TEST_F(StringsContainsTests, DotAll) +{ + auto input = cudf::test::strings_column_wrapper({"abc\nfa\nef", "fff\nabbc\nfff", "abcdef", ""}); + auto view = cudf::strings_column_view(input); + + auto results = cudf::strings::contains_re(view, "a.*f", cudf::strings::regex_flags::DOTALL); + auto expected_contains = cudf::test::fixed_width_column_wrapper({1, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + results = cudf::strings::contains_re(view, "a.*f"); + expected_contains = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_contains); + + results = cudf::strings::matches_re(view, "a.*f", cudf::strings::regex_flags::DOTALL); + auto expected_matches = cudf::test::fixed_width_column_wrapper({1, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + results = cudf::strings::matches_re(view, "a.*f"); + expected_matches = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_matches); + + results = cudf::strings::count_re(view, "a.*?f", cudf::strings::regex_flags::DOTALL); + auto expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + results = cudf::strings::count_re(view, "a.*?f"); + expected_count = cudf::test::fixed_width_column_wrapper({0, 0, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); + + auto both_flags = cudf::strings::regex_flags::DOTALL | cudf::strings::regex_flags::MULTILINE; + results = + cudf::strings::count_re(view, "a.*?f", static_cast(both_flags)); + expected_count = cudf::test::fixed_width_column_wrapper({2, 1, 1, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count); +} + TEST_F(StringsContainsTests, MediumRegex) { // This results in 95 regex instructions and falls in the 'medium' range. diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp index 820bf5ec216..b614b3b49fe 100644 --- a/cpp/tests/strings/fixed_point_tests.cpp +++ b/cpp/tests/strings/fixed_point_tests.cpp @@ -32,7 +32,7 @@ template class StringsFixedPointConvertTest : public StringsConvertTest { }; -TYPED_TEST_CASE(StringsFixedPointConvertTest, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(StringsFixedPointConvertTest, cudf::test::FixedPointTypes); TYPED_TEST(StringsFixedPointConvertTest, ToFixedPoint) { diff --git a/cpp/tests/strings/format_lists_tests.cpp b/cpp/tests/strings/format_lists_tests.cpp new file mode 100644 index 00000000000..63fcdf6f00e --- /dev/null +++ b/cpp/tests/strings/format_lists_tests.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include + +#include +#include +#include + +struct StringsFormatListsTest : public cudf::test::BaseFixture { +}; + +TEST_F(StringsFormatListsTest, EmptyList) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + auto const input = STR_LISTS{}; + auto const view = cudf::lists_column_view(input); + + auto results = cudf::strings::format_list_column(view); + cudf::test::expect_strings_empty(results->view()); +} + +TEST_F(StringsFormatListsTest, EmptyNestedList) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + auto const input = STR_LISTS{STR_LISTS{STR_LISTS{}, STR_LISTS{}}, STR_LISTS{STR_LISTS{}}}; + auto const view = cudf::lists_column_view(input); + + auto results = cudf::strings::format_list_column(view); + auto expected = cudf::test::strings_column_wrapper({"[[],[]]", "[[]]"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsFormatListsTest, WithNulls) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + auto const input = STR_LISTS{{STR_LISTS{{"a", "", "ccc"}, cudf::test::iterators::null_at(1)}, + STR_LISTS{}, + STR_LISTS{{"", "bb", "ddd"}, cudf::test::iterators::null_at(0)}, + STR_LISTS{"zzz", "xxxxx"}, + STR_LISTS{{"v", "", "", "w"}, cudf::test::iterators::null_at(2)}}, + cudf::test::iterators::null_at(1)}; + auto const view = cudf::lists_column_view(input); + + auto results = cudf::strings::format_list_column(view); + auto expected = cudf::test::strings_column_wrapper( + {"[a,NULL,ccc]", "NULL", "[NULL,bb,ddd]", "[zzz,xxxxx]", "[v,,NULL,w]"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsFormatListsTest, CustomParameters) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + auto const input = + STR_LISTS{STR_LISTS{{STR_LISTS{{"a", "", "ccc"}, cudf::test::iterators::null_at(1)}, + STR_LISTS{}, + STR_LISTS{"ddd", "ee", "f"}}, + cudf::test::iterators::null_at(1)}, + {STR_LISTS{"gg", "hhh"}, STR_LISTS{"i", "", "", "jj"}}}; + auto const view = cudf::lists_column_view(input); + auto separators = cudf::test::strings_column_wrapper({": ", "{", "}"}); + + auto results = cudf::strings::format_list_column( + view, cudf::string_scalar("null"), cudf::strings_column_view(separators)); + auto expected = cudf::test::strings_column_wrapper( + {"{{a: null: ccc}: null: {ddd: ee: f}}", "{{gg: hhh}: {i: : : jj}}"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsFormatListsTest, NestedList) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + auto const input = + STR_LISTS{{STR_LISTS{"a", "bb", "ccc"}, STR_LISTS{}, STR_LISTS{"ddd", "ee", "f"}}, + {STR_LISTS{"gg", "hhh"}, STR_LISTS{"i", "", "", "jj"}}}; + auto const view = cudf::lists_column_view(input); + + auto results = cudf::strings::format_list_column(view); + auto expected = + cudf::test::strings_column_wrapper({"[[a,bb,ccc],[],[ddd,ee,f]]", "[[gg,hhh],[i,,,jj]]"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(StringsFormatListsTest, SlicedLists) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + auto const input = + STR_LISTS{{STR_LISTS{{"a", "", "bb"}, cudf::test::iterators::null_at(1)}, + STR_LISTS{}, + STR_LISTS{{"", "ccc", "dddd"}, cudf::test::iterators::null_at(0)}, + STR_LISTS{"zzz", ""}, + STR_LISTS{}, + STR_LISTS{{"abcdef", "012345", "", ""}, cudf::test::iterators::null_at(2)}, + STR_LISTS{{"", "11111", "00000"}, cudf::test::iterators::null_at(0)}, + STR_LISTS{"hey hey", "way way"}, + STR_LISTS{}, + STR_LISTS{"ééé", "12345abcdef"}, + 
STR_LISTS{"www", "12345"}}, + cudf::test::iterators::nulls_at({1, 4, 8})}; + + // matching expected strings + auto const h_expected = std::vector({"[a,NULL,bb]", + "NULL", + "[NULL,ccc,dddd]", + "[zzz,]", + "NULL", + "[abcdef,012345,NULL,]", + "[NULL,11111,00000]", + "[hey hey,way way]", + "NULL", + "[ééé,12345abcdef]", + "[www,12345]"}); + + // set of slice intervals: covers slicing the front, back, and middle + std::vector> index_pairs({{0, 11}, {0, 4}, {3, 8}, {5, 11}}); + for (auto indexes : index_pairs) { + auto sliced = cudf::lists_column_view(cudf::slice(input, {indexes.first, indexes.second})[0]); + auto results = cudf::strings::format_list_column(sliced); + auto expected = cudf::test::strings_column_wrapper(h_expected.begin() + indexes.first, + h_expected.begin() + indexes.second); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + +TEST_F(StringsFormatListsTest, Errors) +{ + using STR_LISTS = cudf::test::lists_column_wrapper; + + cudf::test::lists_column_wrapper invalid({1, 2, 3}); + EXPECT_THROW(cudf::strings::format_list_column(cudf::lists_column_view(invalid)), + cudf::logic_error); + + auto const input = STR_LISTS{STR_LISTS{}, STR_LISTS{}}; + auto const view = cudf::lists_column_view(input); + auto separators = cudf::test::strings_column_wrapper({"{", "}"}); + + EXPECT_THROW(cudf::strings::format_list_column( + view, cudf::string_scalar(""), cudf::strings_column_view(separators)), + cudf::logic_error); + + EXPECT_THROW(cudf::strings::format_list_column(view, cudf::string_scalar("", false)), + cudf::logic_error); +} diff --git a/cpp/tests/strings/hash_string.cu b/cpp/tests/strings/hash_string.cu deleted file mode 100644 index b5298d39bda..00000000000 --- a/cpp/tests/strings/hash_string.cu +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "./utilities.h" -#include "rmm/exec_policy.hpp" - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -struct StringsHashTest : public cudf::test::BaseFixture { -}; - -struct hash_string_fn { - cudf::column_device_view d_strings; - uint32_t __device__ operator()(uint32_t idx) - { - if (d_strings.is_null(idx)) return 0; - auto item = d_strings.element(idx); - return MurmurHash3_32{}(item); - } -}; - -TEST_F(StringsHashTest, HashTest) -{ - std::vector h_strings{"abcdefghijklmnopqrstuvwxyz", - "abcdefghijklmnopqrstuvwxyz", - "ABCDEFGHIJKLMNOPQRSTUVWXYZ", - "0123456789", - "4", - "", - nullptr, - "last one"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - - auto strings_view = cudf::strings_column_view(strings); - auto strings_column = cudf::column_device_view::create(strings_view.parent()); - auto d_view = *strings_column; - - rmm::device_uvector d_values(strings_view.size(), rmm::cuda_stream_default); - thrust::transform(rmm::exec_policy(), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_view.size()), - d_values.begin(), - hash_string_fn{d_view}); - - uint32_t h_expected[] = { - 2739798893, 2739798893, 3506676360, 1891213601, 3778137224, 0, 0, 1551088011}; - auto h_values = cudf::detail::make_host_vector_sync(d_values); - for (uint32_t idx = 0; idx < h_values.size(); ++idx) - EXPECT_EQ(h_values[idx], h_expected[idx]); -} diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 25459b17d6f..a9f9eacede4 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -287,7 +287,7 @@ template class StringsIntegerConvertTest : public StringsConvertTest { }; -TYPED_TEST_CASE(StringsIntegerConvertTest, cudf::test::IntegralTypesNotBool); +TYPED_TEST_SUITE(StringsIntegerConvertTest, cudf::test::IntegralTypesNotBool); TYPED_TEST(StringsIntegerConvertTest, FromToInteger) { @@ -330,7 +330,7 @@ class StringsFloatConvertTest : public StringsConvertTest { }; using FloatTypes = cudf::test::Types; -TYPED_TEST_CASE(StringsFloatConvertTest, FloatTypes); +TYPED_TEST_SUITE(StringsFloatConvertTest, FloatTypes); TYPED_TEST(StringsFloatConvertTest, FromToIntegerError) { diff --git a/cpp/tests/strings/pad_tests.cpp b/cpp/tests/strings/pad_tests.cpp index a64304d1027..f344b5432a2 100644 --- a/cpp/tests/strings/pad_tests.cpp +++ b/cpp/tests/strings/pad_tests.cpp @@ -104,11 +104,10 @@ TEST_F(StringsPadTest, ZeroSizeStringsColumn) cudf::test::expect_strings_empty(results->view()); } -class StringsPadParmsTest : public StringsPadTest, - public testing::WithParamInterface { +class PadParameters : public StringsPadTest, public testing::WithParamInterface { }; -TEST_P(StringsPadParmsTest, Padding) +TEST_P(PadParameters, Padding) { std::vector h_strings{"eee ddd", "bb cc", "aa", "bbb", "fff", "", "o"}; cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); @@ -128,8 +127,8 @@ TEST_P(StringsPadParmsTest, Padding) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -INSTANTIATE_TEST_CASE_P(StringsPadParmWidthTest, - StringsPadParmsTest, +INSTANTIATE_TEST_CASE_P(StringsPadTest, + PadParameters, testing::ValuesIn(std::array{5, 6, 7})); TEST_F(StringsPadTest, ZFill) diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 
1f01f0f1429..fc1c20d8719 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ #include -struct StringsReplaceTests : public cudf::test::BaseFixture { +struct StringsReplaceRegexTest : public cudf::test::BaseFixture { }; -TEST_F(StringsReplaceTests, ReplaceRegexTest) +TEST_F(StringsReplaceRegexTest, ReplaceRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", "the fat cat lays next to the other accénted cat", @@ -59,7 +59,7 @@ TEST_F(StringsReplaceTests, ReplaceRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceMultiRegexTest) +TEST_F(StringsReplaceRegexTest, ReplaceMultiRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", "the fat cat lays next to the other accénted cat", @@ -95,7 +95,7 @@ TEST_F(StringsReplaceTests, ReplaceMultiRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, InvalidRegex) +TEST_F(StringsReplaceRegexTest, InvalidRegex) { cudf::test::strings_column_wrapper strings( {"abc*def|ghi+jkl", ""}); // these do not really matter @@ -116,7 +116,7 @@ TEST_F(StringsReplaceTests, InvalidRegex) cudf::logic_error); } -TEST_F(StringsReplaceTests, WithEmptyPattern) +TEST_F(StringsReplaceRegexTest, WithEmptyPattern) { std::vector h_strings{"asd", "xcv"}; cudf::test::strings_column_wrapper strings( @@ -133,7 +133,7 @@ TEST_F(StringsReplaceTests, WithEmptyPattern) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexTest) +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", "the fat cat lays next to the other accénted cat", @@ -167,7 +167,7 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexAltIndexPatternTest) +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexAltIndexPatternTest) { cudf::test::strings_column_wrapper strings({"12-3 34-5 67-89", "0-99: 777-888:: 5673-0"}); auto strings_view = cudf::strings_column_view(strings); @@ -181,7 +181,7 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexAltIndexPatternTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexReversedTest) +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexReversedTest) { cudf::test::strings_column_wrapper strings( {"A543", "Z756", "", "tést-string", "two-thréé four-fivé", "abcd-éfgh", "tést-string-again"}); @@ -200,7 +200,7 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexReversedTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, BackrefWithGreedyQuantifier) +TEST_F(StringsReplaceRegexTest, BackrefWithGreedyQuantifier) { cudf::test::strings_column_wrapper input( {"
<h1>title</h1><h2>ABC</h2>", "<h1>1234567</h1><h2>XYZ</h2>
"}); @@ -217,7 +217,7 @@ TEST_F(StringsReplaceTests, BackrefWithGreedyQuantifier) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, ReplaceBackrefsRegexErrorTest) +TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexErrorTest) { cudf::test::strings_column_wrapper strings({"this string left intentionally blank"}); auto view = cudf::strings_column_view(strings); @@ -228,7 +228,7 @@ TEST_F(StringsReplaceTests, ReplaceBackrefsRegexErrorTest) EXPECT_THROW(cudf::strings::replace_with_backrefs(view, "(\\w)", ""), cudf::logic_error); } -TEST_F(StringsReplaceTests, MediumReplaceRegex) +TEST_F(StringsReplaceRegexTest, MediumReplaceRegex) { // This results in 95 regex instructions and falls in the 'medium' range. std::string medium_regex = @@ -256,7 +256,7 @@ TEST_F(StringsReplaceTests, MediumReplaceRegex) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(StringsReplaceTests, LargeReplaceRegex) +TEST_F(StringsReplaceRegexTest, LargeReplaceRegex) { // This results in 117 regex instructions and falls in the 'large' range. std::string large_regex = diff --git a/cpp/tests/strings/substring_tests.cpp b/cpp/tests/strings/substring_tests.cpp index f9a71407a0d..448b61300fd 100644 --- a/cpp/tests/strings/substring_tests.cpp +++ b/cpp/tests/strings/substring_tests.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include #include @@ -28,8 +28,6 @@ #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - struct StringsSubstringsTest : public cudf::test::BaseFixture { }; @@ -51,11 +49,11 @@ TEST_F(StringsSubstringsTest, Substring) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -class SubstringParmsTest : public StringsSubstringsTest, - public testing::WithParamInterface { +class Parameters : public StringsSubstringsTest, + public testing::WithParamInterface { }; -TEST_P(SubstringParmsTest, Substring) +TEST_P(Parameters, Substring) { std::vector h_strings{"basic strings", "that can", "be used", "with STL"}; cudf::size_type start = GetParam(); @@ -72,7 +70,7 @@ TEST_P(SubstringParmsTest, Substring) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_P(SubstringParmsTest, Substring_From) +TEST_P(Parameters, Substring_From) { std::vector h_strings{"basic strings", "that can", "be used", "with STL"}; cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); @@ -96,7 +94,7 @@ TEST_P(SubstringParmsTest, Substring_From) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_P(SubstringParmsTest, AllEmpty) +TEST_P(Parameters, AllEmpty) { std::vector h_strings{"", "", "", ""}; cudf::size_type start = GetParam(); @@ -118,7 +116,7 @@ TEST_P(SubstringParmsTest, AllEmpty) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_P(SubstringParmsTest, AllNulls) +TEST_P(Parameters, AllNulls) { std::vector h_strings{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; cudf::test::strings_column_wrapper strings( @@ -147,23 +145,9 @@ TEST_P(SubstringParmsTest, AllNulls) } INSTANTIATE_TEST_CASE_P(StringsSubstringsTest, - SubstringParmsTest, + Parameters, testing::ValuesIn(std::array{1, 2, 3})); -TEST_F(StringsSubstringsTest, ZeroSizeStringsColumn) -{ - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_column = cudf::strings_column_view(zero_size_strings_column); - auto results = cudf::strings::slice_strings(strings_column, 1, 2); - 
cudf::test::expect_strings_empty(results->view()); - - cudf::column_view starts_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); - cudf::column_view stops_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); - results = cudf::strings::slice_strings(strings_column, starts_column, stops_column); - cudf::test::expect_strings_empty(results->view()); -} - TEST_F(StringsSubstringsTest, NegativePositions) { cudf::test::strings_column_wrapper strings{ @@ -270,34 +254,59 @@ TEST_F(StringsSubstringsTest, MaxPositions) TEST_F(StringsSubstringsTest, Error) { cudf::test::strings_column_wrapper strings{"this string intentionally left blank"}; - auto strings_column = cudf::strings_column_view(strings); - EXPECT_THROW(cudf::strings::slice_strings(strings_column, 0, 0, 0), cudf::logic_error); -} + auto strings_view = cudf::strings_column_view(strings); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, 0, 0, 0), cudf::logic_error); -struct StringsSubstringsScalarDelimiterTest : public cudf::test::BaseFixture { -}; + auto delim_col = cudf::test::strings_column_wrapper({"", ""}); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, cudf::strings_column_view{delim_col}, -1), + cudf::logic_error); + + auto indexes = cudf::test::fixed_width_column_wrapper({1, 2}); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, indexes, indexes), cudf::logic_error); -TEST_F(StringsSubstringsScalarDelimiterTest, ZeroSizeStringsColumn) + auto indexes_null = cudf::test::fixed_width_column_wrapper({1}, {0}); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, indexes_null, indexes_null), + cudf::logic_error); + + auto indexes_bad = cudf::test::fixed_width_column_wrapper({1}); + EXPECT_THROW(cudf::strings::slice_strings(strings_view, indexes_bad, indexes_bad), + cudf::logic_error); +} + +TEST_F(StringsSubstringsTest, ZeroSizeStringsColumn) { - cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(col0); + cudf::column_view zero_size_strings_column( + cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); + + auto results = cudf::strings::slice_strings(strings_view, 1, 2); + cudf::test::expect_strings_empty(results->view()); + + results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("foo"), 1); + cudf::test::expect_strings_empty(results->view()); + + cudf::column_view starts_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); + cudf::column_view stops_column(cudf::data_type{cudf::type_id::INT32}, 0, nullptr, nullptr, 0); + results = cudf::strings::slice_strings(strings_view, starts_column, stops_column); + cudf::test::expect_strings_empty(results->view()); - auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("foo"), 1); + results = cudf::strings::slice_strings(strings_view, strings_view, 1); cudf::test::expect_strings_empty(results->view()); } -TEST_F(StringsSubstringsScalarDelimiterTest, AllEmpty) +TEST_F(StringsSubstringsTest, AllEmpty) { auto strings_col = cudf::test::strings_column_wrapper({"", "", "", "", ""}); auto strings_view = cudf::strings_column_view(strings_col); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); + auto exp_results = cudf::column_view(strings_col); auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("e"), -1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, 
exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); + results = cudf::strings::slice_strings(strings_view, strings_view, -1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } -TEST_F(StringsSubstringsScalarDelimiterTest, EmptyDelimiter) +TEST_F(StringsSubstringsTest, EmptyDelimiter) { auto strings_col = cudf::test::strings_column_wrapper( {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); @@ -306,11 +315,18 @@ TEST_F(StringsSubstringsScalarDelimiterTest, EmptyDelimiter) auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {true, true, false, true, true, true}); - auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar(""), 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + + auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar(""), 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); + + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + results = cudf::strings::slice_strings(strings_view, cudf::strings_column_view{delim_col}, 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } -TEST_F(StringsSubstringsScalarDelimiterTest, ZeroCount) +TEST_F(StringsSubstringsTest, ZeroCount) { auto strings_col = cudf::test::strings_column_wrapper( {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); @@ -321,10 +337,16 @@ TEST_F(StringsSubstringsScalarDelimiterTest, ZeroCount) {true, true, false, true, true, true}); auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); + + auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, + {true, false, true, false, true, false}); + + results = cudf::strings::slice_strings(strings_view, cudf::strings_column_view{delim_col}, 0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } -TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) +TEST_F(StringsSubstringsTest, SearchScalarDelimiter) { auto strings_col = cudf::test::strings_column_wrapper( {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); @@ -336,7 +358,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) {true, true, false, true, true, true}); auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -344,17 +366,17 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) {"llo", "", "", "lease", "st strings", ""}, {true, true, false, true, true, true}); auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), -1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), 2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_view.parent(), verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_col); } { auto results = cudf::strings::slice_strings(strings_view, cudf::string_scalar("é"), -2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_view.parent(), verbosity); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings_col); } { @@ -367,7 +389,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("o"), 2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -380,7 +402,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("o"), -2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -394,7 +416,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), 3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -408,7 +430,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("éé"), -3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -424,7 +446,7 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar("."), 3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -441,76 +463,11 @@ TEST_F(StringsSubstringsScalarDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings(cudf::strings_column_view{col0}, cudf::string_scalar(".."), -2); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } } -struct StringsSubstringsColumnDelimiterTest : public cudf::test::BaseFixture { -}; - -TEST_F(StringsSubstringsColumnDelimiterTest, ZeroSizeStringsColumn) -{ - cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(col0); - - auto results = cudf::strings::slice_strings(strings_view, strings_view, 1); - // Check empty column - cudf::test::expect_strings_empty(results->view()); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, GenerateExceptions) -{ - auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", "."}); - - EXPECT_THROW(cudf::strings::slice_strings( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1), - cudf::logic_error); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, ColumnAllEmpty) -{ - auto col0 = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - auto delim_col = cudf::test::strings_column_wrapper({"", "foo", "bar", ".", "/"}); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", ""}); - - auto results = cudf::strings::slice_strings( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, DelimiterAllEmptyAndInvalid) -{ - auto col0 = cudf::test::strings_column_wrapper( - {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); - auto 
delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, false, true, false, true, false}); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, true, false, true, true, true}); - - auto results = cudf::strings::slice_strings( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, ZeroDelimiterCount) -{ - auto col0 = cudf::test::strings_column_wrapper( - {"Héllo", "thesé", "", "lease", "tést strings", ""}, {true, true, false, true, true, true}); - auto delim_col = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, false, true, false, true, false}); - - auto exp_results = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, - {true, true, false, true, true, true}); - - auto results = cudf::strings::slice_strings( - cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 0); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); -} - -TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) +TEST_F(StringsSubstringsTest, SearchColumnDelimiter) { { auto col0 = cudf::test::strings_column_wrapper( @@ -523,7 +480,7 @@ TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings( cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -541,7 +498,7 @@ TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings( cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -1); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -565,7 +522,7 @@ TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings( cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, 3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } { @@ -587,6 +544,6 @@ TEST_F(StringsSubstringsColumnDelimiterTest, SearchDelimiter) auto results = cudf::strings::slice_strings( cudf::strings_column_view{col0}, cudf::strings_column_view{delim_col}, -3); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, verbosity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); } } diff --git a/cpp/tests/strings/utilities.cu b/cpp/tests/strings/utilities.cpp similarity index 100% rename from cpp/tests/strings/utilities.cu rename to cpp/tests/strings/utilities.cpp diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp index a94a35e8896..aa7d66dd633 100644 --- a/cpp/tests/structs/structs_column_tests.cpp +++ b/cpp/tests/structs/structs_column_tests.cpp @@ -60,7 +60,7 @@ using FixedWidthTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(TypedStructColumnWrapperTest, FixedWidthTypesNotBool); +TYPED_TEST_SUITE(TypedStructColumnWrapperTest, FixedWidthTypesNotBool); // Test simple struct construction without nullmask, through column factory. // Columns must retain their originally set values. 
diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp index 08b40b22aa4..b26ea87c5b8 100644 --- a/cpp/tests/structs/utilities_tests.cpp +++ b/cpp/tests/structs/utilities_tests.cpp @@ -14,8 +14,6 @@ * limitations under the License. */ -#include - #include #include #include @@ -27,6 +25,7 @@ #include #include #include +#include #include namespace cudf::test { @@ -39,8 +38,7 @@ void flatten_unflatten_compare(table_view const& input_table) { using namespace cudf::structs::detail; - auto [flattened, _, __, ___] = - flatten_nested_columns(input_table, {}, {}, column_nullability::FORCE); + auto flattened = flatten_nested_columns(input_table, {}, {}, column_nullability::FORCE); auto unflattened = unflatten_nested_columns(std::make_unique(flattened), input_table); diff --git a/cpp/tests/transform/nans_to_null_test.cpp b/cpp/tests/transform/nans_to_null_test.cpp index 16783c6e848..477250dda18 100644 --- a/cpp/tests/transform/nans_to_null_test.cpp +++ b/cpp/tests/transform/nans_to_null_test.cpp @@ -63,7 +63,7 @@ struct NaNsToNullTest : public cudf::test::BaseFixture { using test_types = ::testing::Types; -TYPED_TEST_CASE(NaNsToNullTest, test_types); +TYPED_TEST_SUITE(NaNsToNullTest, test_types); TYPED_TEST(NaNsToNullTest, WithMask) { diff --git a/cpp/tests/transform/one_hot_encode_tests.cpp b/cpp/tests/transform/one_hot_encode_tests.cpp new file mode 100644 index 00000000000..128ffa37d69 --- /dev/null +++ b/cpp/tests/transform/one_hot_encode_tests.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace test { + +template +struct OneHotEncodingTestTyped : public BaseFixture { +}; + +struct OneHotEncodingTest : public BaseFixture { +}; + +TYPED_TEST_SUITE(OneHotEncodingTestTyped, cudf::test::NumericTypes); + +TYPED_TEST(OneHotEncodingTestTyped, Basic) +{ + auto input = fixed_width_column_wrapper{8, 8, 8, 9, 9}; + auto category = fixed_width_column_wrapper{8, 9}; + + auto col0 = fixed_width_column_wrapper{1, 1, 1, 0, 0}; + auto col1 = fixed_width_column_wrapper{0, 0, 0, 1, 1}; + + auto expected = table_view{{col0, col1}}; + + [[maybe_unused]] auto [res_ptr, got] = one_hot_encode(input, category); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got); +} + +TYPED_TEST(OneHotEncodingTestTyped, Nulls) +{ + auto input = fixed_width_column_wrapper{{8, 8, 8, 9, 9}, {1, 1, 0, 1, 1}}; + auto category = fixed_width_column_wrapper({8, 9, -1}, {1, 1, 0}); + + auto col0 = fixed_width_column_wrapper{1, 1, 0, 0, 0}; + auto col1 = fixed_width_column_wrapper{0, 0, 0, 1, 1}; + auto col2 = fixed_width_column_wrapper{0, 0, 1, 0, 0}; + + auto expected = table_view{{col0, col1, col2}}; + + [[maybe_unused]] auto [res_ptr, got] = one_hot_encode(input, category); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got); +} + +TEST_F(OneHotEncodingTest, Diagonal) +{ + auto input = fixed_width_column_wrapper{1, 2, 3, 4, 5}; + auto category = fixed_width_column_wrapper{1, 2, 3, 4, 5}; + + auto col0 = fixed_width_column_wrapper{1, 0, 0, 0, 0}; + auto col1 = fixed_width_column_wrapper{0, 1, 0, 0, 0}; + auto col2 = fixed_width_column_wrapper{0, 0, 1, 0, 0}; + auto col3 = fixed_width_column_wrapper{0, 0, 0, 1, 0}; + auto col4 = fixed_width_column_wrapper{0, 0, 0, 0, 1}; + + auto expected = table_view{{col0, col1, col2, col3, col4}}; + + [[maybe_unused]] auto [res_ptr, got] = one_hot_encode(input, category); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got); +} + +TEST_F(OneHotEncodingTest, ZeroInput) +{ + auto input = strings_column_wrapper{}; + auto category = strings_column_wrapper{"rapids", "cudf"}; + + auto col0 = fixed_width_column_wrapper{}; + auto col1 = fixed_width_column_wrapper{}; + + auto expected = table_view{{col0, col1}}; + + [[maybe_unused]] auto [res_ptr, got] = one_hot_encode(input, category); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got); +} + +TEST_F(OneHotEncodingTest, ZeroCat) +{ + auto input = strings_column_wrapper{"ji", "ji", "ji"}; + auto category = strings_column_wrapper{}; + + auto expected = table_view{}; + + [[maybe_unused]] auto [res_ptr, got] = one_hot_encode(input, category); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got); +} + +TEST_F(OneHotEncodingTest, ZeroInputCat) +{ + auto input = strings_column_wrapper{}; + auto category = strings_column_wrapper{}; + + auto expected = table_view{}; + + [[maybe_unused]] auto [res_ptr, got] = one_hot_encode(input, category); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got); +} + +TEST_F(OneHotEncodingTest, OneCat) +{ + auto input = strings_column_wrapper{"ji", "ji", "ji"}; + auto category = strings_column_wrapper{"ji"}; + + auto col0 = fixed_width_column_wrapper{1, 1, 1}; + + auto expected = table_view{{col0}}; + + [[maybe_unused]] auto [res_ptr, got] = one_hot_encode(input, category); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got); +} + +TEST_F(OneHotEncodingTest, NaNs) +{ + auto const nan = std::numeric_limits::signaling_NaN(); + + auto input = fixed_width_column_wrapper{8.f, 8.f, 8.f, 9.f, nan}; + auto category = fixed_width_column_wrapper{8.f, 
9.f, nan}; + + auto col0 = fixed_width_column_wrapper{1, 1, 1, 0, 0}; + auto col1 = fixed_width_column_wrapper{0, 0, 0, 1, 0}; + auto col2 = fixed_width_column_wrapper{0, 0, 0, 0, 1}; + + auto expected = table_view{{col0, col1, col2}}; + + [[maybe_unused]] auto [res_ptr, got] = one_hot_encode(input, category); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got); +} + +TEST_F(OneHotEncodingTest, Strings) +{ + auto input = strings_column_wrapper{ + {"hello", "rapidsai", "cudf", "hello", "cuspatial", "hello", "world", "!"}, + {1, 1, 1, 1, 0, 1, 1, 0}}; + auto category = strings_column_wrapper{{"hello", "world", ""}, {1, 1, 0}}; + + auto col0 = fixed_width_column_wrapper{1, 0, 0, 1, 0, 1, 0, 0}; + auto col1 = fixed_width_column_wrapper{0, 0, 0, 0, 0, 0, 1, 0}; + auto col2 = fixed_width_column_wrapper{0, 0, 0, 0, 1, 0, 0, 1}; + + auto expected = table_view{{col0, col1, col2}}; + + [[maybe_unused]] auto [res_ptr, got] = one_hot_encode(input, category); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got); +} + +TEST_F(OneHotEncodingTest, Dictionary) +{ + auto input = dictionary_column_wrapper{"aa", "xx", "aa", "aa", "yy", "ef"}; + auto category = dictionary_column_wrapper{"aa", "ef"}; + + auto col0 = fixed_width_column_wrapper{1, 0, 1, 1, 0, 0}; + auto col1 = fixed_width_column_wrapper{0, 0, 0, 0, 0, 1}; + + auto expected = table_view{{col0, col1}}; + + [[maybe_unused]] auto [res_ptr, got] = one_hot_encode(input, category); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got); +} + +TEST_F(OneHotEncodingTest, MismatchTypes) +{ + auto input = strings_column_wrapper{"xx", "yy", "xx"}; + auto category = fixed_width_column_wrapper{1}; + + EXPECT_THROW(one_hot_encode(input, category), cudf::logic_error); +} + +} // namespace test +} // namespace cudf diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 8284def5f13..f718fbfc57b 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -36,7 +36,7 @@ template struct RowBitCountTyped : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(RowBitCountTyped, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(RowBitCountTyped, cudf::test::FixedWidthTypes); TYPED_TEST(RowBitCountTyped, SimpleTypes) { @@ -87,14 +87,21 @@ std::pair, std::unique_ptr> build_list_column() using LCW = cudf::test::lists_column_wrapper; constexpr size_type type_size = sizeof(device_storage_type_t) * CHAR_BIT; - // clang-format off - cudf::test::lists_column_wrapper col{ {{1, 2}, {3, 4, 5}}, - LCW{LCW{}}, - {LCW{10}}, - {{6, 7, 8}, {9}}, - {{-1, -2}, {-3, -4}}, - {{-5, -6, -7}, {-8, -9}} }; - // clang-format on + // { + // {{1, 2}, {3, 4, 5}}, + // {{}}, + // {LCW{10}}, + // {{6, 7, 8}, {9}}, + // {{-1, -2}, {-3, -4}}, + // {{-5, -6, -7}, {-8, -9}} + // } + cudf::test::fixed_width_column_wrapper values{ + 1, 2, 3, 4, 5, 10, 6, 7, 8, 9, -1, -2, -3, -4, -5, -6, -7, -8, -9}; + cudf::test::fixed_width_column_wrapper inner_offsets{ + 0, 2, 5, 6, 9, 10, 12, 14, 17, 19}; + auto inner_list = cudf::make_lists_column(9, inner_offsets.release(), values.release(), 0, {}); + cudf::test::fixed_width_column_wrapper outer_offsets{0, 2, 2, 3, 5, 7, 9}; + auto list = cudf::make_lists_column(6, outer_offsets.release(), std::move(inner_list), 0, {}); // expected size = (num rows at level 1 + num_rows at level 2) + # values in the leaf cudf::test::fixed_width_column_wrapper expected{ @@ -105,7 +112,7 @@ std::pair, std::unique_ptr> build_list_column() ((4 + 8) * CHAR_BIT) + (type_size * 4), ((4 + 8) * CHAR_BIT) + 
(type_size * 5)}; - return {col.release(), expected.release()}; + return {std::move(list), expected.release()}; } TYPED_TEST(RowBitCountTyped, Lists) @@ -128,18 +135,26 @@ TYPED_TEST(RowBitCountTyped, ListsWithNulls) using LCW = cudf::test::lists_column_wrapper; constexpr size_type type_size = sizeof(device_storage_type_t) * CHAR_BIT; - std::vector valids{true, false, true}; - std::vector valids2{false, true, false}; - std::vector valids3{true, false}; - - // clang-format off - cudf::test::lists_column_wrapper col{ {{1, 2}, {{3, 4, 5}, valids.begin()}}, - LCW{LCW{}}, - {LCW{10}}, - {{{{6, 7, 8}, valids2.begin()}, {9}}, valids3.begin()} }; - // clang-format on - - table_view t({col}); + // { + // {{1, 2}, {3, null, 5}}, + // {{}}, + // {LCW{10}}, + // {{null, 7, null}, null}, + // } + cudf::test::fixed_width_column_wrapper values{{1, 2, 3, 4, 5, 10, 6, 7, 8}, + {1, 1, 1, 0, 1, 1, 0, 1, 0}}; + cudf::test::fixed_width_column_wrapper inner_offsets{0, 2, 5, 6, 9, 9}; + std::vector inner_list_validity{1, 1, 1, 1, 0}; + auto inner_list = cudf::make_lists_column( + 5, + inner_offsets.release(), + values.release(), + 1, + cudf::test::detail::make_null_mask(inner_list_validity.begin(), inner_list_validity.end())); + cudf::test::fixed_width_column_wrapper outer_offsets{0, 2, 2, 3, 5}; + auto list = cudf::make_lists_column(4, outer_offsets.release(), std::move(inner_list), 0, {}); + + table_view t({*list}); auto result = cudf::row_bit_count(t); // expected size = (num rows at level 1 + num_rows at level 2) + # values in the leaf + validity @@ -704,8 +719,8 @@ TEST_F(RowBitCount, EmptyTable) } { - auto strings = cudf::make_empty_column(data_type{type_id::STRING}); - auto ints = cudf::make_empty_column(data_type{type_id::INT32}); + auto strings = cudf::make_empty_column(type_id::STRING); + auto ints = cudf::make_empty_column(type_id::INT32); cudf::table_view empty({*strings, *ints}); auto result = cudf::row_bit_count(empty); diff --git a/cpp/tests/transpose/transpose_test.cpp b/cpp/tests/transpose/transpose_test.cpp index 8d81f4dce84..7b7b7d8a4a9 100644 --- a/cpp/tests/transpose/transpose_test.cpp +++ b/cpp/tests/transpose/transpose_test.cpp @@ -158,7 +158,7 @@ template class TransposeTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(TransposeTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(TransposeTest, cudf::test::FixedWidthTypes); TYPED_TEST(TransposeTest, SingleValue) { run_test(1, 1, false); } diff --git a/cpp/tests/types/traits_test.cpp b/cpp/tests/types/traits_test.cpp index 7f38fbf7106..d1afa200f59 100644 --- a/cpp/tests/types/traits_test.cpp +++ b/cpp/tests/types/traits_test.cpp @@ -43,7 +43,7 @@ template class TypedTraitsTest : public TraitsTest { }; -TYPED_TEST_CASE(TypedTraitsTest, cudf::test::AllTypes); +TYPED_TEST_SUITE(TypedTraitsTest, cudf::test::AllTypes); TEST_F(TraitsTest, NumericDataTypesAreNumeric) { diff --git a/cpp/tests/types/type_dispatcher_test.cu b/cpp/tests/types/type_dispatcher_test.cu index 9f8301e31a0..dca80b597c0 100644 --- a/cpp/tests/types/type_dispatcher_test.cu +++ b/cpp/tests/types/type_dispatcher_test.cu @@ -32,7 +32,7 @@ template struct TypedDispatcherTest : public DispatcherTest { }; -TYPED_TEST_CASE(TypedDispatcherTest, cudf::test::AllTypes); +TYPED_TEST_SUITE(TypedDispatcherTest, cudf::test::AllTypes); namespace { template @@ -90,7 +90,7 @@ template struct TypedDoubleDispatcherTest : public DispatcherTest { }; -TYPED_TEST_CASE(TypedDoubleDispatcherTest, cudf::test::AllTypes); +TYPED_TEST_SUITE(TypedDoubleDispatcherTest, cudf::test::AllTypes); 
namespace { template diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp index f3b0a231b34..b57ccdd816a 100644 --- a/cpp/tests/unary/cast_tests.cpp +++ b/cpp/tests/unary/cast_tests.cpp @@ -274,7 +274,7 @@ template struct CastChronosTyped : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(CastChronosTyped, cudf::test::ChronoTypes); +TYPED_TEST_SUITE(CastChronosTyped, cudf::test::ChronoTypes); // Return a list of chrono type ids whose precision is greater than or equal // to the input type id @@ -446,7 +446,7 @@ template struct CastToDurations : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(CastToDurations, cudf::test::IntegralTypes); +TYPED_TEST_SUITE(CastToDurations, cudf::test::IntegralTypes); TYPED_TEST(CastToDurations, AllValid) { @@ -479,7 +479,7 @@ template struct CastFromDurations : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(CastFromDurations, cudf::test::NumericTypes); +TYPED_TEST_SUITE(CastFromDurations, cudf::test::NumericTypes); TYPED_TEST(CastFromDurations, AllValid) { @@ -554,7 +554,7 @@ template struct FixedPointTests : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(FixedPointTests, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointTests, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointTests, CastToDouble) { diff --git a/cpp/tests/unary/math_ops_test.cpp b/cpp/tests/unary/math_ops_test.cpp index 08a40edb776..b0c59b4354a 100644 --- a/cpp/tests/unary/math_ops_test.cpp +++ b/cpp/tests/unary/math_ops_test.cpp @@ -32,7 +32,7 @@ template struct UnaryLogicalOpsTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(UnaryLogicalOpsTest, cudf::test::NumericTypes); +TYPED_TEST_SUITE(UnaryLogicalOpsTest, cudf::test::NumericTypes); TYPED_TEST(UnaryLogicalOpsTest, LogicalNot) { @@ -89,7 +89,7 @@ template struct UnaryMathOpsTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(UnaryMathOpsTest, cudf::test::NumericTypes); +TYPED_TEST_SUITE(UnaryMathOpsTest, cudf::test::NumericTypes); TYPED_TEST(UnaryMathOpsTest, ABS) { @@ -238,7 +238,7 @@ struct UnaryMathFloatOpsTest : public cudf::test::BaseFixture { using floating_point_type_list = ::testing::Types; -TYPED_TEST_CASE(UnaryMathFloatOpsTest, floating_point_type_list); +TYPED_TEST_SUITE(UnaryMathFloatOpsTest, floating_point_type_list); TYPED_TEST(UnaryMathFloatOpsTest, SimpleSIN) { diff --git a/cpp/tests/unary/unary_ops_test.cpp b/cpp/tests/unary/unary_ops_test.cpp index 0bb6bf740f5..664322a386f 100644 --- a/cpp/tests/unary/unary_ops_test.cpp +++ b/cpp/tests/unary/unary_ops_test.cpp @@ -58,7 +58,7 @@ template struct IsNull : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(IsNull, cudf::test::NumericTypes); +TYPED_TEST_SUITE(IsNull, cudf::test::NumericTypes); TYPED_TEST(IsNull, AllValid) { @@ -109,7 +109,7 @@ template struct IsNotNull : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(IsNotNull, cudf::test::NumericTypes); +TYPED_TEST_SUITE(IsNotNull, cudf::test::NumericTypes); TYPED_TEST(IsNotNull, AllValid) { @@ -160,7 +160,7 @@ template struct IsNAN : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(IsNAN, cudf::test::FloatingPointTypes); +TYPED_TEST_SUITE(IsNAN, cudf::test::FloatingPointTypes); TYPED_TEST(IsNAN, AllValid) { @@ -213,7 +213,7 @@ template struct IsNotNAN : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(IsNotNAN, cudf::test::FloatingPointTypes); +TYPED_TEST_SUITE(IsNotNAN, cudf::test::FloatingPointTypes); TYPED_TEST(IsNotNAN, AllValid) { @@ -266,7 +266,7 @@ template struct FixedPointUnaryTests : public cudf::test::BaseFixture { }; 
-TYPED_TEST_CASE(FixedPointUnaryTests, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(FixedPointUnaryTests, cudf::test::FixedPointTypes); TYPED_TEST(FixedPointUnaryTests, FixedPointUnaryAbs) { diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 0f10d6efe4a..8341425e9e7 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -809,7 +809,7 @@ void expect_equal_buffers(void const* lhs, void const* rhs, std::size_t size_byt std::vector bitmask_to_host(cudf::column_view const& c) { if (c.nullable()) { - auto num_bitmasks = bitmask_allocation_size_bytes(c.size()) / sizeof(bitmask_type); + auto num_bitmasks = num_bitmask_words(c.size()); std::vector host_bitmask(num_bitmasks); if (c.offset() == 0) { CUDA_TRY(cudaMemcpy(host_bitmask.data(), @@ -940,11 +940,22 @@ struct column_view_printer { std::vector& out, std::string const& indent) { - // // For timestamps, convert timestamp column to column of strings, then // call string version - // - auto col_as_strings = cudf::strings::from_timestamps(col); + std::string format = [&]() { + if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%SZ"}; + } else if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%3fZ"}; + } else if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%6fZ"}; + } else if constexpr (std::is_same_v) { + return std::string{"%Y-%m-%dT%H:%M:%S.%9fZ"}; + } + return std::string{"%Y-%m-%d"}; + }(); + + auto col_as_strings = cudf::strings::from_timestamps(col, format); if (col_as_strings->size() == 0) { return; } this->template operator()(*col_as_strings, out, indent); diff --git a/cpp/tests/utilities_tests/column_utilities_tests.cpp b/cpp/tests/utilities_tests/column_utilities_tests.cpp index 0dc10f9d717..082f493da7d 100644 --- a/cpp/tests/utilities_tests/column_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/column_utilities_tests.cpp @@ -50,10 +50,10 @@ template struct ColumnUtilitiesTestFixedPoint : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ColumnUtilitiesTest, cudf::test::FixedWidthTypes); -TYPED_TEST_CASE(ColumnUtilitiesTestIntegral, cudf::test::IntegralTypes); -TYPED_TEST_CASE(ColumnUtilitiesTestFloatingPoint, cudf::test::FloatingPointTypes); -TYPED_TEST_CASE(ColumnUtilitiesTestFixedPoint, cudf::test::FixedPointTypes); +TYPED_TEST_SUITE(ColumnUtilitiesTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(ColumnUtilitiesTestIntegral, cudf::test::IntegralTypes); +TYPED_TEST_SUITE(ColumnUtilitiesTestFloatingPoint, cudf::test::FloatingPointTypes); +TYPED_TEST_SUITE(ColumnUtilitiesTestFixedPoint, cudf::test::FixedPointTypes); TYPED_TEST(ColumnUtilitiesTest, NonNullableToHost) { @@ -133,7 +133,7 @@ TYPED_TEST(ColumnUtilitiesTest, NullableToHostAllValid) auto masks = cudf::test::detail::make_null_mask_vector(all_valid, all_valid + size); - EXPECT_TRUE(std::equal(masks.begin(), masks.end(), host_data.second.begin())); + EXPECT_TRUE(cudf::test::validate_host_masks(masks, host_data.second, size)); } struct ColumnUtilitiesEquivalenceTest : public cudf::test::BaseFixture { diff --git a/cpp/tests/utilities_tests/column_wrapper_tests.cpp b/cpp/tests/utilities_tests/column_wrapper_tests.cpp index 6c799b8d632..9d2cc257f02 100644 --- a/cpp/tests/utilities_tests/column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/column_wrapper_tests.cpp @@ -32,7 +32,7 @@ struct FixedWidthColumnWrapperTest : public cudf::test::BaseFixture, auto data_type() { return 
cudf::data_type{cudf::type_to_id()}; } }; -TYPED_TEST_CASE(FixedWidthColumnWrapperTest, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(FixedWidthColumnWrapperTest, cudf::test::FixedWidthTypes); TYPED_TEST(FixedWidthColumnWrapperTest, EmptyIterator) { @@ -229,7 +229,7 @@ struct StringsColumnWrapperTest : public cudf::test::BaseFixture, auto data_type() { return cudf::data_type{cudf::type_to_id()}; } }; -TYPED_TEST_CASE(StringsColumnWrapperTest, cudf::test::StringTypes); +TYPED_TEST_SUITE(StringsColumnWrapperTest, cudf::test::StringTypes); TYPED_TEST(StringsColumnWrapperTest, EmptyList) { diff --git a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp index f40d8c796f3..d2578044aae 100644 --- a/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp +++ b/cpp/tests/utilities_tests/lists_column_wrapper_tests.cpp @@ -43,7 +43,7 @@ using FixedWidthTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(ListColumnWrapperTestTyped, FixedWidthTypesNotBool); +TYPED_TEST_SUITE(ListColumnWrapperTestTyped, FixedWidthTypesNotBool); TYPED_TEST(ListColumnWrapperTestTyped, List) { diff --git a/cpp/tests/utilities_tests/type_check_tests.cpp b/cpp/tests/utilities_tests/type_check_tests.cpp index bd94f724776..84a2d15d477 100644 --- a/cpp/tests/utilities_tests/type_check_tests.cpp +++ b/cpp/tests/utilities_tests/type_check_tests.cpp @@ -33,7 +33,7 @@ struct ColumnTypeCheckTestTyped : public cudf::test::BaseFixture { struct ColumnTypeCheckTest : public cudf::test::BaseFixture { }; -TYPED_TEST_CASE(ColumnTypeCheckTestTyped, cudf::test::FixedWidthTypes); +TYPED_TEST_SUITE(ColumnTypeCheckTestTyped, cudf::test::FixedWidthTypes); TYPED_TEST(ColumnTypeCheckTestTyped, SameFixedWidth) { diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu index 64d9ad6fc3f..b458f34cca8 100644 --- a/cpp/tests/wrappers/timestamps_test.cu +++ b/cpp/tests/wrappers/timestamps_test.cu @@ -69,7 +69,7 @@ struct compare_chrono_elements_to_primitive_representation { } }; -TYPED_TEST_CASE(ChronoColumnTest, cudf::test::ChronoTypes); +TYPED_TEST_SUITE(ChronoColumnTest, cudf::test::ChronoTypes); TYPED_TEST(ChronoColumnTest, ChronoDurationsMatchPrimitiveRepresentation) { diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index 12ff1f13bc4..76026c23d50 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -75,9 +75,12 @@ Binary operator functions DataFrame.add DataFrame.sub + DataFrame.subtract DataFrame.mul - DataFrame.div + DataFrame.multiply DataFrame.truediv + DataFrame.div + DataFrame.divide DataFrame.floordiv DataFrame.mod DataFrame.pow @@ -89,6 +92,14 @@ Binary operator functions DataFrame.rfloordiv DataFrame.rmod DataFrame.rpow + DataFrame.round + DataFrame.lt + DataFrame.gt + DataFrame.le + DataFrame.ge + DataFrame.ne + DataFrame.eq + DataFrame.product Function application, GroupBy & window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -192,6 +203,7 @@ Reshaping, sorting, transposing DataFrame.unstack DataFrame.melt DataFrame.explode + DataFrame.to_struct DataFrame.T DataFrame.transpose @@ -241,6 +253,7 @@ Serialization / IO / conversion DataFrame.from_pandas DataFrame.from_records DataFrame.hash_columns + DataFrame.hash_values DataFrame.to_arrow DataFrame.to_dlpack DataFrame.to_parquet diff --git a/docs/cudf/source/api_docs/groupby.rst b/docs/cudf/source/api_docs/groupby.rst index 27a314fa425..cf08d1d791b 100644 --- a/docs/cudf/source/api_docs/groupby.rst 
+++ b/docs/cudf/source/api_docs/groupby.rst @@ -77,6 +77,7 @@ application to columns of a specific data type. DataFrameGroupBy.cummin DataFrameGroupBy.cumsum DataFrameGroupBy.describe + DataFrameGroupBy.diff DataFrameGroupBy.ffill DataFrameGroupBy.fillna DataFrameGroupBy.idxmax diff --git a/docs/cudf/source/api_docs/index.rst b/docs/cudf/source/api_docs/index.rst index 0bf1d11bff4..87cb32fda36 100644 --- a/docs/cudf/source/api_docs/index.rst +++ b/docs/cudf/source/api_docs/index.rst @@ -18,4 +18,4 @@ This page provides a list of all publicly accessible modules, methods and classe window io subword_tokenize - + string_handling diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index c23c9a3f6c1..30269bb2a72 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -23,6 +23,7 @@ Properties Index.empty Index.gpu_values + Index.has_duplicates Index.is_monotonic Index.is_monotonic_increasing Index.is_monotonic_decreasing @@ -46,6 +47,14 @@ Modifying and computations Index.drop_duplicates Index.equals Index.factorize + Index.is_boolean + Index.is_categorical + Index.is_floating + Index.is_integer + Index.is_interval + Index.is_mixed + Index.is_numeric + Index.is_object Index.min Index.max Index.rename @@ -84,9 +93,15 @@ Conversion :toctree: api/ Index.astype + Index.to_array + Index.to_arrow Index.to_list Index.to_series Index.to_frame + Index.to_pandas + Index.to_dlpack + Index.from_pandas + Index.from_arrow Sorting ~~~~~~~ @@ -110,6 +125,8 @@ Combining / joining / set operations :toctree: api/ Index.append + Index.union + Index.intersection Index.join Index.difference @@ -249,7 +266,13 @@ Time/date components DatetimeIndex.minute DatetimeIndex.second DatetimeIndex.dayofweek + DatetimeIndex.dayofyear + DatetimeIndex.day_of_year DatetimeIndex.weekday + DatetimeIndex.is_leap_year + DatetimeIndex.quarter + DatetimeIndex.isocalendar + Time-specific operations ~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index ffa809268f3..46a31a0dcf6 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -44,7 +44,6 @@ Conversion Series.copy Series.to_list Series.__array__ - Series.as_index Series.as_mask Series.scale @@ -75,12 +74,15 @@ Binary operator functions Series.mul Series.multiply Series.truediv + Series.div + Series.divide Series.floordiv Series.mod Series.pow Series.radd Series.rsub Series.rmul + Series.rdiv Series.rtruediv Series.rfloordiv Series.rmod @@ -99,6 +101,7 @@ Function application, GroupBy & window .. autosummary:: :toctree: api/ + Series.apply Series.applymap Series.map Series.groupby @@ -250,6 +253,7 @@ Datetime, Timedelta :ref:`dt ` String :ref:`str ` Categorical :ref:`cat ` List :ref:`list ` +Struct :ref:`struct ` =========================== ================================= .. _api.series.dt: @@ -270,12 +274,23 @@ Datetime properties day dayofweek + dayofyear + days_in_month + day_of_year hour minute month second weekday year + is_leap_year + is_month_start + is_month_end + is_quarter_start + is_quarter_end + is_year_start + is_year_end + quarter Datetime methods ^^^^^^^^^^^^^^^^ @@ -284,6 +299,7 @@ Datetime methods :toctree: api/ strftime + isocalendar Timedelta properties @@ -299,99 +315,8 @@ Timedelta properties nanoseconds seconds - .. 
_api.series.str: - -String handling -~~~~~~~~~~~~~~~ - -``Series.str`` can be used to access the values of the series as -strings and apply several methods to it. These can be accessed like -``Series.str.``. - -.. currentmodule:: cudf.core.column.string.StringMethods -.. autosummary:: - :toctree: api/ - - byte_count - capitalize - cat - center - character_ngrams - character_tokenize - code_points - contains - count - detokenize - edit_distance - endswith - extract - filter_alphanum - filter_characters - filter_tokens - find - findall - get - get_json_object - htoi - index - insert - ip2int - is_consonant - is_vowel - isalnum - isalpha - isdecimal - isdigit - isempty - isfloat - ishex - isinteger - isipv4 - isspace - islower - isnumeric - isupper - istimestamp - join - len - ljust - lower - lstrip - match - ngrams - ngrams_tokenize - normalize_characters - pad - partition - porter_stemmer_measure - replace - replace_tokens - replace_with_backrefs - rfind - rindex - rjust - rpartition - rstrip - slice - slice_from - slice_replace - split - rsplit - startswith - strip - subword_tokenize - swapcase - title - token_count - tokenize - translate - upper - url_decode - url_encode - wrap - zfill - +.. include:: string_handling.rst .. @@ -454,6 +379,23 @@ lists and apply list methods to it. These can be accessed like unique +.. _api.series.struct: + +Struct handling +~~~~~~~~~~~~~~~ + +``Series.struct`` can be used to access the values of the series as +Structs and apply struct methods to it. These can be accessed like +``Series.struct.``. + +.. currentmodule:: cudf.core.column.struct.StructMethods +.. autosummary:: + :toctree: api/ + + field + explode + + Serialization / IO / conversion ------------------------------- .. currentmodule:: cudf @@ -475,4 +417,4 @@ Serialization / IO / conversion Series.from_pandas Series.hash_encode Series.hash_values - \ No newline at end of file + diff --git a/docs/cudf/source/api_docs/string_handling.rst b/docs/cudf/source/api_docs/string_handling.rst new file mode 100644 index 00000000000..3087bcaa826 --- /dev/null +++ b/docs/cudf/source/api_docs/string_handling.rst @@ -0,0 +1,97 @@ +String handling +~~~~~~~~~~~~~~~ + +``Series.str`` can be used to access the values of the series as +strings and apply several methods to it. These can be accessed like +``Series.str.``. + +.. currentmodule:: cudf.core.column.string.StringMethods +.. 
autosummary:: + :toctree: api/ + + byte_count + capitalize + cat + center + character_ngrams + character_tokenize + code_points + contains + count + detokenize + edit_distance + edit_distance_matrix + endswith + extract + filter_alphanum + filter_characters + filter_tokens + find + findall + get + get_json_object + hex_to_int + htoi + index + insert + ip2int + ip_to_int + is_consonant + is_vowel + isalnum + isalpha + isdecimal + isdigit + isempty + isfloat + ishex + isinteger + isipv4 + isspace + islower + isnumeric + isupper + istimestamp + istitle + join + len + ljust + lower + lstrip + match + ngrams + ngrams_tokenize + normalize_characters + normalize_spaces + pad + partition + porter_stemmer_measure + repeat + replace + replace_tokens + replace_with_backrefs + rfind + rindex + rjust + rpartition + rsplit + rstrip + slice + slice_from + slice_replace + split + rsplit + startswith + strip + subword_tokenize + swapcase + title + token_count + tokenize + translate + upper + url_decode + url_encode + wrap + zfill + diff --git a/docs/cudf/source/basics/basics.rst b/docs/cudf/source/basics/basics.rst index ee63f67daa2..cae7d017291 100644 --- a/docs/cudf/source/basics/basics.rst +++ b/docs/cudf/source/basics/basics.rst @@ -36,7 +36,8 @@ The following table lists all of cudf types. For methods requiring dtype argumen +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ | Boolean | | np.bool_ | ``'bool'`` | +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Decimal | Decimal64Dtype | (none) | (none) | + | Decimal | Decimal32Dtype, | (none) | (none) | + | | Decimal64Dtype | | | +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ **Note: All dtypes above are Nullable** diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst index d4d41d02dee..29cbc2024fc 100644 --- a/docs/cudf/source/basics/io-gds-integration.rst +++ b/docs/cudf/source/basics/io-gds-integration.rst @@ -7,10 +7,13 @@ GDS also has a compatibility mode that allows the library to fall back to copyin The SDK is available for download `here `_. Use of GPUDirect Storage in cuDF is disabled by default, and can be enabled through environment variable ``LIBCUDF_CUFILE_POLICY``. -This variable also controls the GDS compatibility mode. There are two special values for the environment variable: +This variable also controls the GDS compatibility mode. -- "GDS": Use of GDS is enabled; GDS compatibility mode is *off*. -- "ALWAYS": Use of GDS is enabled; GDS compatibility mode is *on*. +There are three special values for the environment variable: + +- "GDS": Enable GDS use; GDS compatibility mode is *off*. +- "ALWAYS": Enable GDS use; GDS compatibility mode is *on*. +- "OFF": Compretely disable GDS use. Any other value (or no value set) will keep the GDS disabled for use in cuDF and IO will be done using cuDF's CPU bounce buffers. @@ -28,4 +31,4 @@ Operations that support the use of GPUDirect Storage: - `to_parquet` - `to_orc` -NOTE: current GDS integration is not fully optimized and enabling GDS will not lead to performance improvements in all cases. 
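As a quick illustration of the ``LIBCUDF_CUFILE_POLICY`` variable described above, here is a minimal, hedged sketch (not part of the patch itself): it assumes cuDF is installed with cuFile/GDS support available and uses a placeholder output path. Setting the variable before ``cudf`` is imported is the safest way to make sure libcudf picks it up when the cuFile integration initializes::

    import os

    # Choose the GDS policy before cudf (and therefore libcudf) is loaded.
    # "GDS" turns compatibility mode off; "ALWAYS" keeps it on; "OFF" disables GDS entirely.
    os.environ["LIBCUDF_CUFILE_POLICY"] = "ALWAYS"

    import cudf

    df = cudf.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
    df.to_parquet("/tmp/example.parquet")                    # to_parquet is GDS-eligible
    roundtrip = cudf.read_parquet("/tmp/example.parquet")    # read_parquet is GDS-eligible
    print(roundtrip)

The ``LIBCUDF_NVCOMP_POLICY`` variable introduced later in this patch can be set in the same way.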
\ No newline at end of file +NOTE: current GDS integration is not fully optimized and enabling GDS will not lead to performance improvements in all cases. diff --git a/docs/cudf/source/basics/io-nvcomp-integration.rst b/docs/cudf/source/basics/io-nvcomp-integration.rst new file mode 100644 index 00000000000..af89ab5285f --- /dev/null +++ b/docs/cudf/source/basics/io-nvcomp-integration.rst @@ -0,0 +1,26 @@ +nvCOMP Integration +============================= + +Some types of compression/decompression can be performed using either `nvCOMP library `_ or the internal implementation. + +Which implementation is used by default depends on the data format and the compression type. Behavior can be influenced through environment variable ``LIBCUDF_NVCOMP_POLICY``. + +There are three special values for the environment variable: + +- "STABLE": Only enable the nvCOMP in places where it has been deemed stable for production use. +- "ALWAYS": Enable all available uses of nvCOMP, including new, experimental combinations. +- "OFF": Disable nvCOMP use whenever possible and use the internal implementations instead. + +Any other value (or no value set) will result in the same behavior as the "STABLE" option. + + +.. table:: Current policy for nvCOMP use for different types + :widths: 20 15 15 15 15 15 15 15 15 15 + + +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+ + | | CSV | Parquet | JSON | ORC | AVRO | + +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+ + | Compression Type | Writer | Reader | Writer | Reader | Writer¹ | Reader | Writer | Reader | Reader | + +=======================+========+========+========+========+=========+========+========+========+========+ + | snappy | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | + +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+ diff --git a/docs/cudf/source/basics/io-supported-types.rst b/docs/cudf/source/basics/io-supported-types.rst index 78c1bfb6554..0962113eb25 100644 --- a/docs/cudf/source/basics/io-supported-types.rst +++ b/docs/cudf/source/basics/io-supported-types.rst @@ -56,9 +56,11 @@ The following table lists are compatible cudf types for each supported IO format +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ | datetime64[ns] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ - | struct | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | + | struct | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ - | decimal64 | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | + | decimal32 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | + +-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ + | decimal64 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | 
+-----------------------+--------+--------+--------+--------+---------+--------+--------+--------+--------+---------+---------+--------+--------+---------+---------+ **Notes:** diff --git a/docs/cudf/source/basics/io.rst b/docs/cudf/source/basics/io.rst index e88162d8f52..ee3d997d664 100644 --- a/docs/cudf/source/basics/io.rst +++ b/docs/cudf/source/basics/io.rst @@ -9,4 +9,5 @@ This page contains Input / Output related APIs in cuDF. :caption: Contents: io-supported-types.rst - io-gds-integration.rst \ No newline at end of file + io-gds-integration.rst + io-nvcomp-integration.rst \ No newline at end of file diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 3cd1ac2e0c4..215d11cdbb8 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -22,7 +22,7 @@ "- CuPy NDArrays\n", "- Numba DeviceNDArrays\n", "\n", - "It also demonstrates cuDF's default null handling behavior, and how to write UDFs that can interact with null values in a limited fashion. Finally, it demonstrates some newer more general null handling via the `DataFrame.apply` API." + "It also demonstrates cuDF's default null handling behavior, and how to write UDFs that can interact with null values in a limited fashion. Finally, it demonstrates some newer more general null handling via the `apply` API." ] }, { @@ -1447,20 +1447,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "More general support for `NA` handling is provided on an experimental basis. While the details of the way this works are out of scope of this guide, the broad strokes of the pipeline are similar to those of `Series.applymap`: Numba is used to translate a standard python function into an operation on the data columns and their masks, and then the reduced and optimized version of this function is runtime compiled and called using the data. \n", + "More general support for `NA` handling is provided on an experimental basis. Numba is used to translate a standard python function into an operation on the data columns and their masks, and then the reduced and optimized version of this function is runtime compiled and called using the data. \n", "\n", - "One advantage of this approach apart from the ability to handle nulls generally in an intuitive manner is it results in a very familiar API to Pandas users. Let's see how this works with an example.\n", - "\n", - "The key to accessing this API is a decorator: `cudf.core.udf.pipeline.nulludf`:" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "from cudf.core.udf.pipeline import nulludf" + "One advantage of this approach apart from the ability to handle nulls generally in an intuitive manner is it results in a very familiar API to Pandas users. Let's see how this works with an example." ] }, { @@ -1472,7 +1461,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1527,7 +1516,7 @@ "2 3 6" ] }, - "execution_count": 25, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1544,18 +1533,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The entrypoint for UDFs used in this manner is `cudf.DataFrame.apply`. To use it, start by defining a completely standard python function decorated with the decorator `nulludf`:" + "The entrypoint for UDFs used in this manner is `cudf.DataFrame.apply`. 
To use it, start by defining a standard python function designed to accept a single dict-like row of the dataframe:" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "@nulludf\n", - "def f(x, y):\n", - " return x + y" + "def f(row):\n", + " return row['A'] + row['B']" ] }, { @@ -1567,7 +1555,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1579,31 +1567,25 @@ "dtype: int64" ] }, - "execution_count": 27, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.apply(\n", - " lambda row: f(\n", - " row['A'],\n", - " row['B']\n", - " ),\n", - " axis=1\n", - ")" + "df.apply(f, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Advanced users might recognize that cuDF does not actually have a `row` object (a special type of Pandas series that behaves like a dict). The `nulludf` decorator is the key to making this work - it really just rearranges things nicely such that the API works in this way. The same function works the same way in pandas, except without the decorator of course:" + "The same function should produce the same result as pandas:" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1615,22 +1597,13 @@ "dtype: object" ] }, - "execution_count": 28, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "def g(x, y):\n", - " return x + y\n", - "\n", - "df.to_pandas(nullable=True).apply(\n", - " lambda row: g(\n", - " row['A'],\n", - " row['B']\n", - " ),\n", - " axis=1\n", - ")" + "df.to_pandas(nullable=True).apply(f, axis=1)" ] }, { @@ -1649,7 +1622,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1700,14 +1673,14 @@ "2 3" ] }, - "execution_count": 29, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "@nulludf\n", - "def f(x):\n", + "def f(row):\n", + " x = row['a']\n", " if x is cudf.NA:\n", " return 0\n", " else:\n", @@ -1719,7 +1692,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1731,13 +1704,13 @@ "dtype: int64" ] }, - "execution_count": 30, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.apply(lambda row: f(row['a']))" + "df.apply(f, axis=1)" ] }, { @@ -1749,7 +1722,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -1804,14 +1777,15 @@ "2 3 1" ] }, - "execution_count": 31, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "@nulludf\n", - "def f(x, y):\n", + "def f(row):\n", + " x = row['a']\n", + " y = row['b']\n", " if x + y > 3:\n", " return cudf.NA\n", " else:\n", @@ -1826,7 +1800,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1838,13 +1812,13 @@ "dtype: int64" ] }, - "execution_count": 32, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.apply(lambda row: f(row['a'], row['b']))" + "df.apply(f, axis=1)" ] }, { @@ -1856,7 +1830,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1911,15 +1885,14 @@ "2 3 3.14" ] }, - "execution_count": 33, + "execution_count": 32, "metadata": 
{}, "output_type": "execute_result" } ], "source": [ - "@nulludf\n", - "def f(x, y):\n", - " return x + y\n", + "def f(row):\n", + " return row['a'] + row['b']\n", "\n", "df = cudf.DataFrame({\n", " 'a': [1, 2, 3], \n", @@ -1930,7 +1903,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -1942,13 +1915,13 @@ "dtype: float64" ] }, - "execution_count": 34, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.apply(lambda row: f(row['a'], row['b']))" + "df.apply(f, axis=1)" ] }, { @@ -1973,7 +1946,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -2024,14 +1997,14 @@ "2 5" ] }, - "execution_count": 35, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "@nulludf\n", - "def f(x):\n", + "def f(row):\n", + " x = row['a']\n", " if x > 3:\n", " return x\n", " else:\n", @@ -2045,7 +2018,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -2057,13 +2030,13 @@ "dtype: float64" ] }, - "execution_count": 36, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.apply(lambda row: f(row['a']))" + "df.apply(f, axis=1)" ] }, { @@ -2075,7 +2048,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -2142,16 +2115,14 @@ "2 3 6 4 8 6" ] }, - "execution_count": 37, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "\n", - "@nulludf\n", - "def f(v, w, x, y, z):\n", - " return x + (y - (z / w)) % v\n", + "def f(row):\n", + " return row['a'] + (row['b'] - (row['c'] / row['d'])) % row['e']\n", "\n", "df = cudf.DataFrame({\n", " 'a': [1, 2, 3],\n", @@ -2165,40 +2136,117 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 \n", - "1 4.8\n", - "2 5.0\n", + "0 \n", + "1 2.428571429\n", + "2 8.5\n", "dtype: float64" ] }, - "execution_count": 38, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.apply(f, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## `cudf.Series.apply`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "cuDF provides a similar API to `pandas.Series.apply` for applying scalar UDFs to series objects. Like pandas, these UDFs do not need to be written in terms of rows. These UDFs have generalized null handling and are slightly more flexible than those that work with `applymap`. Ultimately, `applymap` will be deprecated and removed in favor of `apply`. 
Here is an example: " + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a cuDF series\n", + "sr = cudf.Series([1, cudf.NA, 3])" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "# define a scalar function\n", + "def f(x):\n", + " if x is cudf.NA:\n", + " return 42\n", + " else:\n", + " return 2**x" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2\n", + "1 42\n", + "2 8\n", + "dtype: int64" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sr.apply(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2\n", + "1 42\n", + "2 8\n", + "dtype: int64" + ] + }, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.apply(\n", - " lambda row: f(\n", - " row['a'],\n", - " row['b'],\n", - " row['c'],\n", - " row['d'],\n", - " row['e']\n", - " )\n", - ")" + "# Check the pandas result\n", + "sr.to_pandas(nullable=True).apply(f)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Caveats" + "## Caveats" ] }, { @@ -2206,16 +2254,7 @@ "metadata": {}, "source": [ "- Only numeric nondecimal scalar types are currently supported as of yet, but strings and structured types are in planning. Attempting to use this API with those types will throw a `TypeError`.\n", - "- Due to some more recent CUDA features being leveraged in the pipeline, support for CUDA 11.0 is currently unavailable. In particular, the 11.1+ toolkit will be needed, else the API will raise.\n", - "- We do not yet fully support all arithmetic operators. Certain ops like bitwise operations are not currently implemented, but planned in future releases. If an operator is needed, a github issue should be raised so that it can be properly prioritized and implemented.\n", - "- Due to limitations in the Numba's output is currently runtime compiled, we can't yet support certain functions:\n", - " - `pow`\n", - " - `sin`\n", - " - `cos`\n", - " - `tan`\n", - " \n", - " Attempting to use these functions inside a UDF will result in an NVRTC error.\n", - " " + "- We do not yet fully support all arithmetic operators. Certain ops like bitwise operations are not currently implemented, but planned in future releases. If an operator is needed, a github issue should be raised so that it can be properly prioritized and implemented." ] }, { @@ -2255,7 +2294,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.8.12" } }, "nbformat": 4, diff --git a/java/pom.xml b/java/pom.xml index db79f94009b..356d94455c8 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -158,6 +158,7 @@ 1.8 5.4.2 false + false ${basedir}/target/cmake-build false @@ -489,6 +490,7 @@ true ${ai.rapids.refcount.debug} + ${ai.rapids.cudf.nvtx.enabled} diff --git a/java/src/main/java/ai/rapids/cudf/CloseableArray.java b/java/src/main/java/ai/rapids/cudf/CloseableArray.java new file mode 100644 index 00000000000..5c75f2378e8 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/CloseableArray.java @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.rapids.cudf; + +/** Utility class that wraps an array of closeable instances and can be closed */ +public class CloseableArray implements AutoCloseable { + private T[] array; + + public static CloseableArray wrap(T[] array) { + return new CloseableArray(array); + } + + CloseableArray(T[] array) { + this.array = array; + } + + public int size() { + return array.length; + } + + public T get(int i) { + return array[i]; + } + + public T set(int i, T obj) { + array[i] = obj; + return obj; + } + + public T[] getArray() { + return array; + } + + public T[] release() { + T[] result = array; + array = null; + return result; + } + + public void closeAt(int i) { + try { + T toClose = array[i]; + array[i] = null; + toClose.close(); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public void close() { + close(null); + } + + public void close(Exception pendingError) { + if (array == null) { + return; + } + T[] toClose = array; + array = null; + RuntimeException error = null; + if (pendingError instanceof RuntimeException) { + error = (RuntimeException) pendingError; + } else if (pendingError != null) { + error = new RuntimeException(pendingError); + } + for (T obj: toClose) { + if (obj != null) { + try { + obj.close(); + } catch (RuntimeException e) { + if (error != null) { + error.addSuppressed(e); + } else { + error = e; + } + } catch (Exception e) { + if (error != null) { + error.addSuppressed(e); + } else { + error = new RuntimeException(e); + } + } + } + } + if (error != null) { + throw error; + } + } +} diff --git a/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java similarity index 75% rename from java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java rename to java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java index 229cb0262d3..0e49636fae6 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetColumnWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnWriterOptions.java @@ -22,38 +22,41 @@ import java.util.List; /** - * Per column settings for writing Parquet files. + * Per column settings for writing Parquet/ORC files. + * + * The native also uses the same "column_in_metadata" for both Parquet and ORC. 
*/ -public class ParquetColumnWriterOptions { +public class ColumnWriterOptions { + // `isTimestampTypeInt96` is ignored in ORC private boolean isTimestampTypeInt96; private int precision; private boolean isNullable; private boolean isMap = false; - private String columName; - private ParquetColumnWriterOptions(AbstractStructBuilder builder) { - this.columName = builder.name; + private String columnName; + private ColumnWriterOptions(AbstractStructBuilder builder) { + this.columnName = builder.name; this.isNullable = builder.isNullable; this.childColumnOptions = - (ParquetColumnWriterOptions[]) builder.children.toArray(new ParquetColumnWriterOptions[0]); + (ColumnWriterOptions[]) builder.children.toArray(new ColumnWriterOptions[0]); } /** * Constructor used for list */ - private ParquetColumnWriterOptions(ListBuilder builder) { + private ColumnWriterOptions(ListBuilder builder) { assert(builder.children.size() == 1) : "Lists can only have one child"; - this.columName = builder.name; + this.columnName = builder.name; this.isNullable = builder.isNullable; // we are adding the child twice even though lists have one child only because the way the cudf // has implemented this it requires two children to be set for the list, but it drops the // first one. This is something that is a lower priority and might be fixed in future this.childColumnOptions = - new ParquetColumnWriterOptions[]{DUMMY_CHILD, builder.children.get(0)}; + new ColumnWriterOptions[]{DUMMY_CHILD, builder.children.get(0)}; } - protected ParquetColumnWriterOptions[] childColumnOptions = {}; + protected ColumnWriterOptions[] childColumnOptions = {}; protected abstract static class AbstractStructBuilder extends NestedBuilder { + V extends ColumnWriterOptions> extends NestedBuilder { /** * Builder specific to build a Struct meta */ @@ -72,10 +75,10 @@ protected AbstractStructBuilder() { // https://github.com/rapidsai/cudf/pull/7461/commits/5ce33b40abb87cc7b76b5efeb0a3a0215f9ef6fb // but it was reverted later on here: // https://github.com/rapidsai/cudf/pull/7461/commits/f248eb7265de995a95f998d46d897fb0ae47f53e - static ParquetColumnWriterOptions DUMMY_CHILD = new ParquetColumnWriterOptions("DUMMY"); + static ColumnWriterOptions DUMMY_CHILD = new ColumnWriterOptions("DUMMY"); - public static abstract class NestedBuilder { - protected List children = new ArrayList<>(); + public static abstract class NestedBuilder { + protected List children = new ArrayList<>(); protected boolean isNullable = true; protected String name = ""; @@ -89,34 +92,34 @@ protected NestedBuilder(String name, boolean isNullable) { protected NestedBuilder() {} - protected ParquetColumnWriterOptions withColumns(String name, boolean isNullable) { - return new ParquetColumnWriterOptions(name, isNullable); + protected ColumnWriterOptions withColumns(String name, boolean isNullable) { + return new ColumnWriterOptions(name, isNullable); } - protected ParquetColumnWriterOptions withDecimal(String name, int precision, - boolean isNullable) { - return new ParquetColumnWriterOptions(name, false, precision, isNullable); + protected ColumnWriterOptions withDecimal(String name, int precision, + boolean isNullable) { + return new ColumnWriterOptions(name, false, precision, isNullable); } - protected ParquetColumnWriterOptions withTimestamp(String name, boolean isInt96, - boolean isNullable) { - return new ParquetColumnWriterOptions(name, isInt96, 0, isNullable); + protected ColumnWriterOptions withTimestamp(String name, boolean isInt96, + boolean isNullable) { + 
return new ColumnWriterOptions(name, isInt96, 0, isNullable); } /** * Set the list column meta. * Lists should have only one child in ColumnVector, but the metadata expects a * LIST column to have two children and the first child to be the - * {@link ParquetColumnWriterOptions#DUMMY_CHILD}. + * {@link ColumnWriterOptions#DUMMY_CHILD}. * This is the current behavior in cudf and will change in future * @return this for chaining. */ - public T withListColumn(ParquetListColumnWriterOptions child) { + public T withListColumn(ListColumnWriterOptions child) { assert (child.getChildColumnOptions().length == 2) : "Lists can only have two children"; if (child.getChildColumnOptions()[0] != DUMMY_CHILD) { throw new IllegalArgumentException("First child in the list has to be DUMMY_CHILD"); } - if (child.getChildColumnOptions()[1].getColumName().isEmpty()) { + if (child.getChildColumnOptions()[1].getColumnName().isEmpty()) { throw new IllegalArgumentException("Column name can't be empty"); } children.add(child); @@ -127,7 +130,7 @@ public T withListColumn(ParquetListColumnWriterOptions child) { * Set the map column meta. * @return this for chaining. */ - public T withMapColumn(ParquetColumnWriterOptions child) { + public T withMapColumn(ColumnWriterOptions child) { children.add(child); return (T) this; } @@ -136,9 +139,9 @@ public T withMapColumn(ParquetColumnWriterOptions child) { * Set a child struct meta data * @return this for chaining. */ - public T withStructColumn(ParquetStructColumnWriterOptions child) { - for (ParquetColumnWriterOptions opt: child.getChildColumnOptions()) { - if (opt.getColumName().isEmpty()) { + public T withStructColumn(StructColumnWriterOptions child) { + for (ColumnWriterOptions opt: child.getChildColumnOptions()) { + if (opt.getColumnName().isEmpty()) { throw new IllegalArgumentException("Column name can't be empty"); } } @@ -230,33 +233,33 @@ public T withNullableTimestampColumn(String name, boolean isInt96) { public abstract V build(); } - public ParquetColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, - int precision, boolean isNullable) { + public ColumnWriterOptions(String columnName, boolean isTimestampTypeInt96, + int precision, boolean isNullable) { this.isTimestampTypeInt96 = isTimestampTypeInt96; this.precision = precision; this.isNullable = isNullable; - this.columName = columnName; + this.columnName = columnName; } - public ParquetColumnWriterOptions(String columnName, boolean isNullable) { + public ColumnWriterOptions(String columnName, boolean isNullable) { this.isTimestampTypeInt96 = false; this.precision = 0; this.isNullable = isNullable; - this.columName = columnName; + this.columnName = columnName; } - public ParquetColumnWriterOptions(String columnName) { + public ColumnWriterOptions(String columnName) { this(columnName, true); } @FunctionalInterface protected interface ByteArrayProducer { - boolean[] apply(ParquetColumnWriterOptions opt); + boolean[] apply(ColumnWriterOptions opt); } @FunctionalInterface protected interface IntArrayProducer { - int[] apply(ParquetColumnWriterOptions opt); + int[] apply(ColumnWriterOptions opt); } boolean[] getFlatIsTimeTypeInt96() { @@ -272,7 +275,7 @@ protected boolean[] getFlatBooleans(boolean[] ret, ByteArrayProducer producer) { boolean[][] childResults = new boolean[childColumnOptions.length][]; int totalChildrenFlatLength = ret.length; for (int i = 0 ; i < childColumnOptions.length ; i++) { - ParquetColumnWriterOptions opt = childColumnOptions[i]; + ColumnWriterOptions opt = 
childColumnOptions[i]; childResults[i] = producer.apply(opt); totalChildrenFlatLength += childResults[i].length; } @@ -327,7 +330,7 @@ protected int[] getFlatInts(int[] ret, IntArrayProducer producer) { int[][] childResults = new int[childColumnOptions.length][]; int totalChildrenFlatLength = ret.length; for (int i = 0 ; i < childColumnOptions.length ; i++) { - ParquetColumnWriterOptions opt = childColumnOptions[i]; + ColumnWriterOptions opt = childColumnOptions[i]; childResults[i] = producer.apply(opt); totalChildrenFlatLength += childResults[i].length; } @@ -343,7 +346,7 @@ protected int[] getFlatInts(int[] ret, IntArrayProducer producer) { } String[] getFlatColumnNames() { - String[] ret = {columName}; + String[] ret = {columnName}; if (childColumnOptions.length > 0) { return getFlatColumnNames(ret); } else { @@ -355,7 +358,7 @@ protected String[] getFlatColumnNames(String[] ret) { String[][] childResults = new String[childColumnOptions.length][]; int totalChildrenFlatLength = ret.length; for (int i = 0 ; i < childColumnOptions.length ; i++) { - ParquetColumnWriterOptions opt = childColumnOptions[i]; + ColumnWriterOptions opt = childColumnOptions[i]; childResults[i] = opt.getFlatColumnNames(); totalChildrenFlatLength += childResults[i].length; } @@ -377,14 +380,14 @@ protected String[] getFlatColumnNames(String[] ret) { * named 'value'. The caller of this method doesn't need to worry about this as this method will * take care of this without the knowledge of the caller. */ - public static ParquetColumnWriterOptions mapColumn(String name, ParquetColumnWriterOptions key, - ParquetColumnWriterOptions value) { - ParquetStructColumnWriterOptions struct = structBuilder("key_value").build(); + public static ColumnWriterOptions mapColumn(String name, ColumnWriterOptions key, + ColumnWriterOptions value) { + StructColumnWriterOptions struct = structBuilder("key_value").build(); if (key.isNullable) { throw new IllegalArgumentException("key column can not be nullable"); } - struct.childColumnOptions = new ParquetColumnWriterOptions[]{key, value}; - ParquetColumnWriterOptions opt = listBuilder(name) + struct.childColumnOptions = new ColumnWriterOptions[]{key, value}; + ColumnWriterOptions opt = listBuilder(name) .withStructColumn(struct) .build(); opt.isMap = true; @@ -422,8 +425,8 @@ public static StructBuilder structBuilder(String name) { /** * Return if the column can have null values */ - public String getColumName() { - return columName; + public String getColumnName() { + return columnName; } /** @@ -450,39 +453,39 @@ public boolean isTimestampTypeInt96() { /** * Return the child columnOptions for this column */ - public ParquetColumnWriterOptions[] getChildColumnOptions() { + public ColumnWriterOptions[] getChildColumnOptions() { return childColumnOptions; } - public static class ParquetStructColumnWriterOptions extends ParquetColumnWriterOptions { - protected ParquetStructColumnWriterOptions(AbstractStructBuilder builder) { + public static class StructColumnWriterOptions extends ColumnWriterOptions { + protected StructColumnWriterOptions(AbstractStructBuilder builder) { super(builder); } } - public static class ParquetListColumnWriterOptions extends ParquetColumnWriterOptions { - protected ParquetListColumnWriterOptions(ListBuilder builder) { + public static class ListColumnWriterOptions extends ColumnWriterOptions { + protected ListColumnWriterOptions(ListBuilder builder) { super(builder); } } - public static class StructBuilder extends AbstractStructBuilder { + public static class 
StructBuilder extends AbstractStructBuilder { public StructBuilder(String name, boolean isNullable) { super(name, isNullable); } - public ParquetStructColumnWriterOptions build() { - return new ParquetStructColumnWriterOptions(this); + public StructColumnWriterOptions build() { + return new StructColumnWriterOptions(this); } } - public static class ListBuilder extends NestedBuilder { + public static class ListBuilder extends NestedBuilder { public ListBuilder(String name, boolean isNullable) { super(name, isNullable); } - public ParquetListColumnWriterOptions build() { - return new ParquetListColumnWriterOptions(this); + public ListColumnWriterOptions build() { + return new ListColumnWriterOptions(this); } } } diff --git a/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java new file mode 100644 index 00000000000..9292975d0ce --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/CompressionMetadataWriterOptions.java @@ -0,0 +1,113 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import java.util.LinkedHashMap; +import java.util.Map; + +public class CompressionMetadataWriterOptions extends ColumnWriterOptions.StructColumnWriterOptions { + private final CompressionType compressionType; + private final Map metadata; + + protected CompressionMetadataWriterOptions(Builder builder) { + super(builder); + this.compressionType = builder.compressionType; + this.metadata = builder.metadata; + } + + @Override + boolean[] getFlatIsTimeTypeInt96() { + return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatIsTimeTypeInt96()); + } + + @Override + int[] getFlatPrecision() { + return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatPrecision()); + } + + @Override + int[] getFlatNumChildren() { + return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatNumChildren()); + } + + @Override + boolean[] getFlatIsNullable() { + return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatIsNullable()); + } + + @Override + boolean[] getFlatIsMap() { + return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatIsMap()); + } + + @Override + String[] getFlatColumnNames() { + return super.getFlatColumnNames(new String[]{}); + } + + String[] getMetadataKeys() { + return metadata.keySet().toArray(new String[metadata.size()]); + } + + String[] getMetadataValues() { + return metadata.values().toArray(new String[metadata.size()]); + } + + public CompressionType getCompressionType() { + return compressionType; + } + + public Map getMetadata() { + return metadata; + } + + public int getTopLevelChildren() { + return childColumnOptions.length; + } + + public abstract static class Builder extends AbstractStructBuilder { + final Map metadata = new LinkedHashMap<>(); + CompressionType compressionType = CompressionType.AUTO; + + /** + * Add a metadata key and a value + */ + public T withMetadata(String key, String 
value) { + this.metadata.put(key, value); + return (T) this; + } + + /** + * Add a map of metadata keys and values + */ + public T withMetadata(Map metadata) { + this.metadata.putAll(metadata); + return (T) this; + } + + /** + * Set the compression type to use for writing + */ + public T withCompressionType(CompressionType compression) { + this.compressionType = compression; + return (T) this; + } + } +} diff --git a/java/src/main/java/ai/rapids/cudf/Cuda.java b/java/src/main/java/ai/rapids/cudf/Cuda.java index 02e4d32617d..5e3722d50b7 100755 --- a/java/src/main/java/ai/rapids/cudf/Cuda.java +++ b/java/src/main/java/ai/rapids/cudf/Cuda.java @@ -15,6 +15,9 @@ */ package ai.rapids.cudf; +import ai.rapids.cudf.NvtxColor; +import ai.rapids.cudf.NvtxRange; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -521,4 +524,40 @@ public static synchronized boolean isEnvCompatibleForTesting() { * Whether per-thread default stream is enabled. */ public static native boolean isPtdsEnabled(); + + /** + * Copy data from multiple device buffer sources to multiple device buffer destinations. + * For each buffer to copy there is a corresponding entry in the destination address, source + * address, and copy size vectors. + * @param destAddrs vector of device destination addresses + * @param srcAddrs vector of device source addresses + * @param copySizes vector of copy sizes + * @param stream CUDA stream to use for the copy + */ + public static void multiBufferCopyAsync(long [] destAddrs, + long [] srcAddrs, + long [] copySizes, + Stream stream) { + // Temporary sub-par stand-in for a multi-buffer copy CUDA kernel + assert(destAddrs.length == srcAddrs.length); + assert(copySizes.length == destAddrs.length); + try (NvtxRange copyRange = new NvtxRange("multiBufferCopyAsync", NvtxColor.CYAN)){ + for (int i = 0; i < destAddrs.length; i++) { + asyncMemcpy(destAddrs[i], srcAddrs[i], copySizes[i], CudaMemcpyKind.DEVICE_TO_DEVICE, stream); + } + } + } + /** + * Begins an Nsight profiling session, if a profiler is currently attached. + * @note if a profiler session has a already started, `profilerStart` has + * no effect. + */ + public static native void profilerStart(); + + /** + * Stops an active Nsight profiling session. + * @note if a profiler session isn't active, `profilerStop` has + * no effect. + */ + public static native void profilerStop(); } diff --git a/java/src/main/java/ai/rapids/cudf/CudaMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/CudaMemoryBuffer.java index 1771c0217ae..c62ae8af82d 100644 --- a/java/src/main/java/ai/rapids/cudf/CudaMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/CudaMemoryBuffer.java @@ -72,7 +72,15 @@ public boolean isClean() { } } - CudaMemoryBuffer(long address, long lengthInBytes, Cuda.Stream stream) { + /** + * Wrap an existing CUDA allocation in a device memory buffer. The CUDA allocation will be freed + * when the resulting device memory buffer instance frees its memory resource (i.e.: when its + * reference count goes to zero). 
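A minimal usage sketch for the Cuda.multiBufferCopyAsync helper added in the Cuda.java hunk above. The buffers and sizes are hypothetical, and it assumes the usual public getAddress()/getLength() accessors and the Cuda.DEFAULT_STREAM constant:

```java
try (DeviceMemoryBuffer src = DeviceMemoryBuffer.allocate(1024);
     DeviceMemoryBuffer dst = DeviceMemoryBuffer.allocate(1024)) {
  long[] destAddrs = { dst.getAddress() };
  long[] srcAddrs  = { src.getAddress() };
  long[] copySizes = { src.getLength() };
  // Currently expands to one DEVICE_TO_DEVICE asyncMemcpy per entry on the given stream.
  Cuda.multiBufferCopyAsync(destAddrs, srcAddrs, copySizes, Cuda.DEFAULT_STREAM);
}
```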
+ * @param address device address of the CUDA memory allocation + * @param lengthInBytes length of the CUDA allocation in bytes + * @param stream CUDA stream to use for synchronization when freeing the allocation + */ + public CudaMemoryBuffer(long address, long lengthInBytes, Cuda.Stream stream) { super(address, lengthInBytes, new CudaBufferCleaner(address, lengthInBytes, stream)); } diff --git a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java index fa888625d47..c4d9bdb8f91 100644 --- a/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/DeviceMemoryBuffer.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -99,8 +99,16 @@ public boolean isClean() { } } - // Static factory method to make this a little simpler from JNI - static DeviceMemoryBuffer fromRmm(long address, long lengthInBytes, long rmmBufferAddress) { + /** + * Wrap an existing RMM allocation in a device memory buffer. The RMM allocation will be freed + * when the resulting device memory buffer instance frees its memory resource (i.e.: when its + * reference count goes to zero). + * @param address device address of the RMM allocation + * @param lengthInBytes length of the RMM allocation in bytes + * @param rmmBufferAddress host address of the rmm::device_buffer that owns the device memory + * @return new device memory buffer instance that wraps the existing RMM allocation + */ + public static DeviceMemoryBuffer fromRmm(long address, long lengthInBytes, long rmmBufferAddress) { return new DeviceMemoryBuffer(address, lengthInBytes, rmmBufferAddress); } diff --git a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java index 6c52b8fe798..b1320e839cd 100644 --- a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java +++ b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java @@ -29,6 +29,8 @@ import java.nio.ByteBuffer; import java.util.ArrayDeque; import java.util.ArrayList; +import java.util.List; +import java.util.Optional; /** * Serialize and deserialize CUDF tables and columns using a custom format. The goal of this is @@ -1660,6 +1662,60 @@ public static void writeConcatedStream(SerializedTableHeader[] headers, // COLUMN AND TABLE READ ///////////////////////////////////////////// + private static HostColumnVectorCore buildHostColumn(SerializedColumnHeader column, + ArrayDeque columnOffsets, + HostMemoryBuffer buffer, + boolean isRootColumn) { + ColumnOffsets offsetsInfo = columnOffsets.remove(); + SerializedColumnHeader[] children = column.getChildren(); + int numChildren = children != null ? 
children.length : 0; + List childColumns = new ArrayList<>(numChildren); + try { + if (children != null) { + for (SerializedColumnHeader child : children) { + childColumns.add(buildHostColumn(child, columnOffsets, buffer, false)); + } + } + DType dtype = column.getType(); + long rowCount = column.getRowCount(); + long nullCount = column.getNullCount(); + HostMemoryBuffer dataBuffer = null; + HostMemoryBuffer validityBuffer = null; + HostMemoryBuffer offsetsBuffer = null; + if (!dtype.isNestedType()) { + dataBuffer = buffer.slice(offsetsInfo.data, offsetsInfo.dataLen); + } + if (nullCount > 0) { + long validitySize = BitVectorHelper.getValidityLengthInBytes(rowCount); + validityBuffer = buffer.slice(offsetsInfo.validity, validitySize); + } + if (dtype.hasOffsets()) { + // one 32-bit integer offset per row plus one additional offset at the end + long offsetsSize = rowCount > 0 ? (rowCount + 1) * Integer.BYTES : 0; + offsetsBuffer = buffer.slice(offsetsInfo.offsets, offsetsSize); + } + HostColumnVectorCore result; + // Only creates HostColumnVector for root columns, since child columns are managed by their parents. + if (isRootColumn) { + result = new HostColumnVector(dtype, column.getRowCount(), + Optional.of(column.getNullCount()), dataBuffer, validityBuffer, offsetsBuffer, + childColumns); + } else { + result = new HostColumnVectorCore(dtype, column.getRowCount(), + Optional.of(column.getNullCount()), dataBuffer, validityBuffer, offsetsBuffer, + childColumns); + } + childColumns = null; + return result; + } finally { + if (childColumns != null) { + for (HostColumnVectorCore c : childColumns) { + c.close(); + } + } + } + } + private static long buildColumnView(SerializedColumnHeader column, ArrayDeque columnOffsets, DeviceMemoryBuffer combinedBuffer) { @@ -1769,6 +1825,38 @@ public static HostConcatResult concatToHostBuffer(SerializedTableHeader[] header } } + /** + * Deserialize a serialized contiguous table into an array of host columns. + * + * @param header serialized table header + * @param hostBuffer buffer containing the data for all columns in the serialized table + * @return array of host columns representing the data from the serialized table + */ + public static HostColumnVector[] unpackHostColumnVectors(SerializedTableHeader header, + HostMemoryBuffer hostBuffer) { + ArrayDeque columnOffsets = buildIndex(header, hostBuffer); + int numColumns = header.getNumColumns(); + HostColumnVector[] columns = new HostColumnVector[numColumns]; + boolean succeeded = false; + try { + for (int i = 0; i < numColumns; i++) { + SerializedColumnHeader column = header.getColumnHeader(i); + columns[i] = (HostColumnVector) buildHostColumn(column, columnOffsets, hostBuffer, true); + } + assert columnOffsets.isEmpty(); + succeeded = true; + } finally { + if (!succeeded) { + for (HostColumnVector c : columns) { + if (c != null) { + c.close(); + } + } + } + } + return columns; + } + /** * After reading a header for a table read the data portion into a host side buffer. * @param in the stream to read the data from. 
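The new JCudfSerialization.unpackHostColumnVectors entry point above returns root HostColumnVector instances that the caller owns. A usage sketch, assuming header and hostBuffer were produced by the existing header/data reading path:

```java
HostColumnVector[] columns =
    JCudfSerialization.unpackHostColumnVectors(header, hostBuffer);
try {
  // consume the deserialized host columns here
} finally {
  for (HostColumnVector c : columns) {
    c.close();
  }
}
```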
diff --git a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java index a936d4830ee..05545807bb6 100644 --- a/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java +++ b/java/src/main/java/ai/rapids/cudf/MemoryCleaner.java @@ -19,8 +19,6 @@ package ai.rapids.cudf; import ai.rapids.cudf.ast.CompiledExpression; -import ai.rapids.cudf.nvcomp.BatchedLZ4Decompressor; -import ai.rapids.cudf.nvcomp.Decompressor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -248,16 +246,6 @@ static void register(Cuda.Event event, Cleaner cleaner) { all.add(new CleanerWeakReference(event, cleaner, collected, false)); } - public static void register(Decompressor.Metadata metadata, Cleaner cleaner) { - // It is now registered... - all.add(new CleanerWeakReference(metadata, cleaner, collected, false)); - } - - public static void register(BatchedLZ4Decompressor.BatchedMetadata metadata, Cleaner cleaner) { - // It is now registered... - all.add(new CleanerWeakReference(metadata, cleaner, collected, false)); - } - static void register(CuFileDriver driver, Cleaner cleaner) { // It is now registered... all.add(new CleanerWeakReference(driver, cleaner, collected, false)); @@ -324,4 +312,4 @@ public String toString() { + "\n"; } } -} \ No newline at end of file +} diff --git a/java/src/main/java/ai/rapids/cudf/NvtxRange.java b/java/src/main/java/ai/rapids/cudf/NvtxRange.java index 739ca71d738..813ab5e66cf 100644 --- a/java/src/main/java/ai/rapids/cudf/NvtxRange.java +++ b/java/src/main/java/ai/rapids/cudf/NvtxRange.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,7 @@ package ai.rapids.cudf; /** - * Utility class to mark an NVTX profiling range. + * This class supports push/pop NVTX profiling ranges, or "scoped" ranges. * * The constructor pushes an NVTX range and the close method pops off the most recent range that * was pushed. Therefore instances of this class should always be used in a try-with-resources @@ -33,9 +33,14 @@ * * Instances should be associated with a single thread to avoid pushing an NVTX range in * one thread and then trying to pop the range in a different thread. + * + * Push/pop ranges show a stacking behavior in tools such as Nsight, where newly pushed + * ranges are correlated and enclosed by the prior pushed range (in the example above, + * "b" is enclosed by "a"). */ public class NvtxRange implements AutoCloseable { private static final boolean isEnabled = Boolean.getBoolean("ai.rapids.cudf.nvtx.enabled"); + static { if (isEnabled) { NativeDepsLoader.loadNativeDeps(); diff --git a/java/src/main/java/ai/rapids/cudf/NvtxUniqueRange.java b/java/src/main/java/ai/rapids/cudf/NvtxUniqueRange.java new file mode 100644 index 00000000000..02322902667 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/NvtxUniqueRange.java @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package ai.rapids.cudf; + +/** + * This class supports start/end NVTX profiling ranges. + * + * Start/end: + * + * The constructor instantiates a new NVTX range and keeps a unique handle that comes back + * from the NVTX api (nvtxRangeId). The handle is used to later close such a range. This type + * of range does not have the same order-of-operation requirements that the push/pop ranges have: + * the `NvtxUniqueRange` instance can be passed to other scopes, and even to other threads + * for the eventual call to close. + * + * It can be used in the same try-with-resources way as push/pop, or interleaved with other + * ranges, like so: + * + *
+ *   NvtxUniqueRange a = new NvtxUniqueRange("a", NvtxColor.RED);
+ *   NvtxUniqueRange b = new NvtxUniqueRange("b", NvtxColor.BLUE);
+ *   a.close();
+ *   b.close();
+ * 
+ */ +public class NvtxUniqueRange implements AutoCloseable { + private static final boolean isEnabled = Boolean.getBoolean("ai.rapids.cudf.nvtx.enabled"); + + // this is a nvtxRangeId_t in the C++ api side + private final long nvtxRangeId; + + // true if this range is already closed + private boolean closed; + + static { + if (isEnabled) { + NativeDepsLoader.loadNativeDeps(); + } + } + + public NvtxUniqueRange(String name, NvtxColor color) { + this(name, color.colorBits); + } + + public NvtxUniqueRange(String name, int colorBits) { + if (isEnabled) { + nvtxRangeId = start(name, colorBits); + } else { + // following the implementation in nvtx3, the default value of 0 + // is given when NVTX is disabled + nvtxRangeId = 0; + } + } + + @Override + public synchronized void close() { + if (closed) { + throw new IllegalStateException( + "Cannot call close on an already closed NvtxUniqueRange!"); + } + closed = true; + if (isEnabled) { + end(this.nvtxRangeId); + } + } + + private native long start(String name, int colorBits); + private native void end(long nvtxRangeId); +} diff --git a/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java index 85443c3ae0f..372f919532e 100644 --- a/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ORCWriterOptions.java @@ -18,7 +18,11 @@ package ai.rapids.cudf; -public class ORCWriterOptions extends CompressedMetadataWriterOptions { +/** + * This class represents settings for writing ORC files. It includes meta data information + * that will be used by the ORC writer to write the file. + */ +public class ORCWriterOptions extends CompressionMetadataWriterOptions { private ORCWriterOptions(Builder builder) { super(builder); @@ -28,7 +32,9 @@ public static Builder builder() { return new Builder(); } - public static class Builder extends CMWriterBuilder { + public static class Builder extends CompressionMetadataWriterOptions.Builder + { + public ORCWriterOptions build() { return new ORCWriterOptions(this); } diff --git a/java/src/main/java/ai/rapids/cudf/OutOfBoundsPolicy.java b/java/src/main/java/ai/rapids/cudf/OutOfBoundsPolicy.java new file mode 100644 index 00000000000..36f39aa8ad3 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/OutOfBoundsPolicy.java @@ -0,0 +1,39 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +/** + * Policy to account for possible out-of-bounds indices + * + * `NULLIFY` means to nullify output values corresponding to out-of-bounds gather map values. + * + * `DONT_CHECK` means do not check whether the indices are out-of-bounds, for better + * performance. Use `DONT_CHECK` carefully, as it can result in a CUDA exception if + * the gather map values are actually out of range. + * + * @note This enum doesn't have a nativeId because the C++ out_of_bounds_policy is a + * a boolean enum. It is just added for clarity in the Java API. 
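As a usage sketch of how this policy is consumed: the Table.gather overload added later in this patch takes an OutOfBoundsPolicy directly (the table and gatherMap variables here are hypothetical):

```java
// NULLIFY yields nulls for out-of-range indices; DONT_CHECK skips the bounds
// check for speed and assumes the gather map is already known to be valid.
try (Table gathered = table.gather(gatherMap, OutOfBoundsPolicy.NULLIFY)) {
  // use gathered ...
}
```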
+ */ +public enum OutOfBoundsPolicy { + /* Output values corresponding to out-of-bounds indices are null */ + NULLIFY, + + /* No bounds checking is performed, better performance */ + DONT_CHECK +} diff --git a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java index 38f8d8e59a4..7b58817550d 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java @@ -18,61 +18,16 @@ package ai.rapids.cudf; -import java.util.LinkedHashMap; -import java.util.Map; - /** * This class represents settings for writing Parquet files. It includes meta data information * that will be used by the Parquet writer to write the file */ -public final class ParquetWriterOptions extends ParquetColumnWriterOptions.ParquetStructColumnWriterOptions { - private final CompressionType compressionType; - private final Map metadata; +public final class ParquetWriterOptions extends CompressionMetadataWriterOptions { private final StatisticsFrequency statsGranularity; private ParquetWriterOptions(Builder builder) { super(builder); this.statsGranularity = builder.statsGranularity; - this.compressionType = builder.compressionType; - this.metadata = builder.metadata; - } - - @Override - boolean[] getFlatIsTimeTypeInt96() { - return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatIsTimeTypeInt96()); - } - - @Override - int[] getFlatPrecision() { - return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatPrecision()); - } - - @Override - int[] getFlatNumChildren() { - return super.getFlatInts(new int[]{}, (opt) -> opt.getFlatNumChildren()); - } - - @Override - boolean[] getFlatIsNullable() { - return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatIsNullable()); - } - - @Override - boolean[] getFlatIsMap() { - return super.getFlatBooleans(new boolean[]{}, (opt) -> opt.getFlatIsMap()); - } - - @Override - String[] getFlatColumnNames() { - return super.getFlatColumnNames(new String[]{}); - } - - String[] getMetadataKeys() { - return metadata.keySet().toArray(new String[metadata.size()]); - } - - String[] getMetadataValues() { - return metadata.values().toArray(new String[metadata.size()]); } public enum StatisticsFrequency { @@ -100,52 +55,14 @@ public StatisticsFrequency getStatisticsFrequency() { return statsGranularity; } - public CompressionType getCompressionType() { - return compressionType; - } - - public Map getMetadata() { - return metadata; - } - - public int getTopLevelChildren() { - return childColumnOptions.length; - } - - public static class Builder extends ParquetColumnWriterOptions.AbstractStructBuilder { + public static class Builder extends CompressionMetadataWriterOptions.Builder + { private StatisticsFrequency statsGranularity = StatisticsFrequency.ROWGROUP; - final Map metadata = new LinkedHashMap<>(); - CompressionType compressionType = CompressionType.AUTO; public Builder() { super(); } - /** - * Add a metadata key and a value - */ - public Builder withMetadata(String key, String value) { - this.metadata.put(key, value); - return this; - } - - /** - * Add a map of metadata keys and values - */ - public Builder withMetadata(Map metadata) { - this.metadata.putAll(metadata); - return this; - } - - /** - * Set the compression type to use for writing - */ - public Builder withCompressionType(CompressionType compression) { - this.compressionType = compression; - return this; - } - public Builder withStatisticsFrequency(StatisticsFrequency 
statsGranularity) { this.statsGranularity = statsGranularity; return this; diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index cf3cd721af9..730f82f0047 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -75,61 +75,6 @@ public static LogConf logToStderr() { return new LogConf(null, LogLoc.STDERR); } - /** - * Initialize memory manager state and storage. This will always initialize - * the CUDA context for the calling thread if it is not already set. The - * caller is responsible for setting the desired CUDA device prior to this - * call if a specific device is already set. - *

NOTE: All cudf methods will set the chosen CUDA device in the CUDA - * context of the calling thread after this returns. - * @param allocationMode Allocation strategy to use. Bit set using - * {@link RmmAllocationMode#CUDA_DEFAULT}, - * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA}, - * {@link RmmAllocationMode#CUDA_ASYNC} and - * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} - * @param enableLogging Enable logging memory manager events - * @param poolSize The initial pool size in bytes - * @throws IllegalStateException if RMM has already been initialized - */ - public static void initialize(int allocationMode, boolean enableLogging, long poolSize) - throws RmmException { - initialize(allocationMode, enableLogging, poolSize, 0); - } - - /** - * Initialize memory manager state and storage. This will always initialize - * the CUDA context for the calling thread if it is not already set. The - * caller is responsible for setting the desired CUDA device prior to this - * call if a specific device is already set. - *

NOTE: All cudf methods will set the chosen CUDA device in the CUDA - * context of the calling thread after this returns. - * @param allocationMode Allocation strategy to use. Bit set using - * {@link RmmAllocationMode#CUDA_DEFAULT}, - * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA}, - * {@link RmmAllocationMode#CUDA_ASYNC} and - * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} - * @param enableLogging Enable logging memory manager events - * @param poolSize The initial pool size in bytes - * @param maxPoolSize The maximum size the pool is allowed to grow. If the specified value - * is <= 0 then the maximum pool size will not be artificially limited. - * @throws IllegalStateException if RMM has already been initialized - */ - public static void initialize(int allocationMode, boolean enableLogging, long poolSize, - long maxPoolSize) throws RmmException { - LogConf lc = null; - if (enableLogging) { - String f = System.getenv("RMM_LOG_FILE"); - if (f != null) { - lc = logTo(new File(f)); - } else { - lc = logToStderr(); - } - } - initialize(allocationMode, lc, poolSize, maxPoolSize); - } - /** * Initialize memory manager state and storage. This will always initialize * the CUDA context for the calling thread if it is not already set. The @@ -149,67 +94,6 @@ public static void initialize(int allocationMode, boolean enableLogging, long po */ public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize) throws RmmException { - initialize(allocationMode, logConf, poolSize, 0); - } - - /** - * Initialize memory manager state and storage. This will always initialize - * the CUDA context for the calling thread if it is not already set. The - * caller is responsible for setting the desired CUDA device prior to this - * call if a specific device is already set. - *

NOTE: All cudf methods will set the chosen CUDA device in the CUDA - * context of the calling thread after this returns. - * @param allocationMode Allocation strategy to use. Bit set using - * {@link RmmAllocationMode#CUDA_DEFAULT}, - * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA}, - * {@link RmmAllocationMode#CUDA_ASYNC} and - * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} - * @param logConf How to do logging or null if you don't want to - * @param poolSize The initial pool size in bytes - * @param maxPoolSize The maximum size the pool is allowed to grow. If the specified value - * is <= 0 then the pool size will not be artificially limited. - * @throws IllegalStateException if RMM has already been initialized - * @throws IllegalArgumentException if a max pool size is specified but the allocation mode - * is not {@link RmmAllocationMode#POOL} or - * {@link RmmAllocationMode#ARENA} or - * {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool - * size is below the initial size. - */ - public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize, - long maxPoolSize) throws RmmException { - initialize(allocationMode, logConf, poolSize, maxPoolSize, 0, 0); - } - - /** - * Initialize memory manager state and storage. This will always initialize - * the CUDA context for the calling thread if it is not already set. The - * caller is responsible for setting the desired CUDA device prior to this - * call if a specific device is already set. - *

NOTE: All cudf methods will set the chosen CUDA device in the CUDA - * context of the calling thread after this returns. - * @param allocationMode Allocation strategy to use. Bit set using - * {@link RmmAllocationMode#CUDA_DEFAULT}, - * {@link RmmAllocationMode#POOL}, - * {@link RmmAllocationMode#ARENA}, - * {@link RmmAllocationMode#CUDA_ASYNC} and - * {@link RmmAllocationMode#CUDA_MANAGED_MEMORY} - * @param logConf How to do logging or null if you don't want to - * @param poolSize The initial pool size in bytes - * @param maxPoolSize The maximum size the pool is allowed to grow. If the specified value - * is <= 0 then the pool size will not be artificially limited. - * @param allocationAlignment The size to which allocations are aligned. - * @param alignmentThreshold Only allocations with size larger than or equal to this threshold - * are aligned with `allocationAlignment`. - * @throws IllegalStateException if RMM has already been initialized - * @throws IllegalArgumentException if a max pool size is specified but the allocation mode - * is not {@link RmmAllocationMode#POOL} or - * {@link RmmAllocationMode#ARENA} or - * {@link RmmAllocationMode#CUDA_ASYNC}, or the maximum pool - * size is below the initial size. - */ - public static synchronized void initialize(int allocationMode, LogConf logConf, long poolSize, - long maxPoolSize, long allocationAlignment, long alignmentThreshold) throws RmmException { if (initialized) { throw new IllegalStateException("RMM is already initialized"); } @@ -219,16 +103,6 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0; boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0; - if (maxPoolSize > 0) { - if (!isPool && !isArena && !isAsync) { - throw new IllegalArgumentException( - "Pool limit only supported in POOL, ARENA, or CUDA_ASYNC allocation mode"); - } - if (maxPoolSize < poolSize) { - throw new IllegalArgumentException("Pool limit of " + maxPoolSize - + " is less than initial pool size of " + poolSize); - } - } if (isAsync && isManaged) { throw new IllegalArgumentException( "CUDA Unified Memory is not supported in CUDA_ASYNC allocation mode"); @@ -242,8 +116,7 @@ public static synchronized void initialize(int allocationMode, LogConf logConf, loc = logConf.loc; } - initializeInternal(allocationMode, loc.internalId, path, poolSize, maxPoolSize, - allocationAlignment, alignmentThreshold); + initializeInternal(allocationMode, loc.internalId, path, poolSize); MemoryCleaner.setDefaultGpu(Cuda.getDevice()); initialized = true; } @@ -289,8 +162,7 @@ private static long[] sortThresholds(long[] thresholds) { } private static native void initializeInternal(int allocationMode, int logTo, String path, - long poolSize, long maxPoolSize, long allocationAlignment, long alignmentThreshold) - throws RmmException; + long poolSize) throws RmmException; /** * Shut down any initialized RMM instance. This should be used very rarely. It does not need to diff --git a/java/src/main/java/ai/rapids/cudf/RollingAggregation.java b/java/src/main/java/ai/rapids/cudf/RollingAggregation.java index 07983f77aad..408c93ff0a1 100644 --- a/java/src/main/java/ai/rapids/cudf/RollingAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/RollingAggregation.java @@ -82,6 +82,19 @@ public static RollingAggregation max() { return new RollingAggregation(Aggregation.max()); } + /** + * Rolling Window Standard Deviation with 1 as delta degrees of freedom(DDOF). 
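A small sketch of the two new factory methods below; in practice the resulting RollingAggregation is bound to a column and window through the existing rolling-window APIs, which are unchanged by this patch and not shown here:

```java
// Default overload uses ddof = 1 (sample standard deviation).
RollingAggregation sampleStd = RollingAggregation.standardDeviation();
// Explicit ddof, e.g. 0 for population standard deviation.
RollingAggregation populationStd = RollingAggregation.standardDeviation(0);
```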
+ */ + public static RollingAggregation standardDeviation() { + return new RollingAggregation(Aggregation.standardDeviation()); + } + + /** + * Rolling Window Standard Deviation with configurable delta degrees of freedom(DDOF). + */ + public static RollingAggregation standardDeviation(int ddof) { + return new RollingAggregation(Aggregation.standardDeviation(ddof)); + } /** * Count number of valid, a.k.a. non-null, elements. diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 0af02d1c926..68e7a21988a 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -337,35 +337,53 @@ private static native long[] readORC(String[] filterColumnNames, /** * Setup everything to write ORC formatted data to a file. * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column * @param nullable true if the column can have nulls else false * @param metadataKeys Metadata key names to place in the Parquet file * @param metadataValues Metadata values corresponding to metadataKeys * @param compression native compression codec ID + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map * @param filename local output path * @return a handle that is used in later calls to writeORCChunk and writeORCEnd. */ private static native long writeORCFileBegin(String[] columnNames, + int numChildren, + int[] flatNumChildren, boolean[] nullable, String[] metadataKeys, String[] metadataValues, int compression, + int[] precisions, + boolean[] isMapValues, String filename) throws CudfException; /** * Setup everything to write ORC formatted data to a buffer. * @param columnNames names that correspond to the table columns + * @param numChildren Children of the top level + * @param flatNumChildren flattened list of children per column * @param nullable true if the column can have nulls else false * @param metadataKeys Metadata key names to place in the Parquet file * @param metadataValues Metadata values corresponding to metadataKeys * @param compression native compression codec ID + * @param precisions precision list containing all the precisions of the decimal types in + * the columns + * @param isMapValues true if a column is a map * @param consumer consumer of host buffers produced. * @return a handle that is used in later calls to writeORCChunk and writeORCEnd. 
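The ORC writer setup below now flattens nested column metadata out of ORCWriterOptions. A caller-side sketch, mirroring the withColumns(...) call that appears further down in Table.writeORC; the schema, compression choice, and output path are hypothetical, and the varargs withColumns signature from the ColumnWriterOptions builders is assumed:

```java
ORCWriterOptions options = ORCWriterOptions.builder()
    .withColumns(true, "a", "b")                 // two nullable top-level columns
    .withCompressionType(CompressionType.SNAPPY)
    .withMetadata("written_by", "cudf-java")
    .build();
try (TableWriter writer = Table.writeORCChunked(options, new File("/tmp/example.orc"))) {
  writer.write(table);                           // 'table' matches the declared schema
}
```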
*/ private static native long writeORCBufferBegin(String[] columnNames, + int numChildren, + int[] flatNumChildren, boolean[] nullable, String[] metadataKeys, String[] metadataValues, int compression, + int[] precisions, + boolean[] isMapValues, HostBufferConsumer consumer) throws CudfException; /** @@ -1079,21 +1097,29 @@ private static class ORCTableWriter implements TableWriter { HostBufferConsumer consumer; private ORCTableWriter(ORCWriterOptions options, File outputFile) { - this.handle = writeORCFileBegin(options.getColumnNames(), - options.getColumnNullability(), + this.handle = writeORCFileBegin(options.getFlatColumnNames(), + options.getTopLevelChildren(), + options.getFlatNumChildren(), + options.getFlatIsNullable(), options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getFlatPrecision(), + options.getFlatIsMap(), outputFile.getAbsolutePath()); this.consumer = null; } private ORCTableWriter(ORCWriterOptions options, HostBufferConsumer consumer) { - this.handle = writeORCBufferBegin(options.getColumnNames(), - options.getColumnNullability(), + this.handle = writeORCBufferBegin(options.getFlatColumnNames(), + options.getTopLevelChildren(), + options.getFlatNumChildren(), + options.getFlatIsNullable(), options.getMetadataKeys(), options.getMetadataValues(), options.getCompressionType().nativeId, + options.getFlatPrecision(), + options.getFlatIsMap(), consumer); this.consumer = consumer; } @@ -1150,7 +1176,7 @@ public void writeORC(File outputFile) { // Need to specify the number of columns but leave all column names undefined String[] names = new String[getNumberOfColumns()]; Arrays.fill(names, ""); - ORCWriterOptions opts = ORCWriterOptions.builder().withColumnNames(names).build(); + ORCWriterOptions opts = ORCWriterOptions.builder().withColumns(true, names).build(); writeORC(opts, outputFile); } @@ -1161,7 +1187,7 @@ public void writeORC(File outputFile) { */ @Deprecated public void writeORC(ORCWriterOptions options, File outputFile) { - assert options.getColumnNames().length == getNumberOfColumns() : "must specify names for all columns"; + assert options.getTopLevelChildren() == getNumberOfColumns() : "must specify names for all columns"; try (TableWriter writer = Table.writeORCChunked(options, outputFile)) { writer.write(this); } @@ -2016,7 +2042,7 @@ public ColumnVector rowBitCount() { * @return the resulting Table. */ public Table gather(ColumnView gatherMap) { - return gather(gatherMap, true); + return gather(gatherMap, OutOfBoundsPolicy.NULLIFY); } /** @@ -2027,16 +2053,36 @@ public Table gather(ColumnView gatherMap) { * * A negative value `i` in the `gatherMap` is interpreted as `i+n`, where * `n` is the number of rows in this table. - + * + * @deprecated Use {@link #gather(ColumnView, OutOfBoundsPolicy)} * @param gatherMap the map of indexes. Must be non-nullable and integral type. * @param checkBounds if true bounds checking is performed on the value. Be very careful * when setting this to false. * @return the resulting Table. */ + @Deprecated public Table gather(ColumnView gatherMap, boolean checkBounds) { return new Table(gather(nativeHandle, gatherMap.getNativeView(), checkBounds)); } + /** + * Gathers the rows of this table according to `gatherMap` such that row "i" + * in the resulting table's columns will contain row "gatherMap[i]" from this table. + * The number of rows in the result table will be equal to the number of elements in + * `gatherMap`. 
+ * + * A negative value `i` in the `gatherMap` is interpreted as `i+n`, where + * `n` is the number of rows in this table. + * + * @param gatherMap the map of indexes. Must be non-nullable and integral type. + * @param outOfBoundsPolicy policy to use when an out-of-range value is in `gatherMap` + * @return the resulting Table. + */ + public Table gather(ColumnView gatherMap, OutOfBoundsPolicy outOfBoundsPolicy) { + boolean checkBounds = outOfBoundsPolicy == OutOfBoundsPolicy.NULLIFY; + return new Table(gather(nativeHandle, gatherMap.getNativeView(), checkBounds)); + } + private GatherMap[] buildJoinGatherMaps(long[] gatherMapData) { long bufferSize = gatherMapData[0]; long leftAddr = gatherMapData[1]; diff --git a/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java index 9ef18dbd75d..6fb5a16d888 100644 --- a/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java +++ b/java/src/main/java/ai/rapids/cudf/ast/UnaryOperator.java @@ -23,29 +23,32 @@ * NOTE: This must be kept in sync with `jni_to_unary_operator` in CompiledExpression.cpp! */ public enum UnaryOperator { - IDENTITY(0), // Identity function - SIN(1), // Trigonometric sine - COS(2), // Trigonometric cosine - TAN(3), // Trigonometric tangent - ARCSIN(4), // Trigonometric sine inverse - ARCCOS(5), // Trigonometric cosine inverse - ARCTAN(6), // Trigonometric tangent inverse - SINH(7), // Hyperbolic sine - COSH(8), // Hyperbolic cosine - TANH(9), // Hyperbolic tangent - ARCSINH(10), // Hyperbolic sine inverse - ARCCOSH(11), // Hyperbolic cosine inverse - ARCTANH(12), // Hyperbolic tangent inverse - EXP(13), // Exponential (base e, Euler number) - LOG(14), // Natural Logarithm (base e) - SQRT(15), // Square-root (x^0.5) - CBRT(16), // Cube-root (x^(1.0/3)) - CEIL(17), // Smallest integer value not less than arg - FLOOR(18), // largest integer value not greater than arg - ABS(19), // Absolute value - RINT(20), // Rounds the floating-point argument arg to an integer value - BIT_INVERT(21), // Bitwise Not (~) - NOT(22); // Logical Not (!) + IDENTITY(0), // Identity function + SIN(1), // Trigonometric sine + COS(2), // Trigonometric cosine + TAN(3), // Trigonometric tangent + ARCSIN(4), // Trigonometric sine inverse + ARCCOS(5), // Trigonometric cosine inverse + ARCTAN(6), // Trigonometric tangent inverse + SINH(7), // Hyperbolic sine + COSH(8), // Hyperbolic cosine + TANH(9), // Hyperbolic tangent + ARCSINH(10), // Hyperbolic sine inverse + ARCCOSH(11), // Hyperbolic cosine inverse + ARCTANH(12), // Hyperbolic tangent inverse + EXP(13), // Exponential (base e, Euler number) + LOG(14), // Natural Logarithm (base e) + SQRT(15), // Square-root (x^0.5) + CBRT(16), // Cube-root (x^(1.0/3)) + CEIL(17), // Smallest integer value not less than arg + FLOOR(18), // largest integer value not greater than arg + ABS(19), // Absolute value + RINT(20), // Rounds the floating-point argument arg to an integer value + BIT_INVERT(21), // Bitwise Not (~) + NOT(22), // Logical Not (!) 
+ CAST_TO_INT64(23), // Cast value to int64_t + CAST_TO_UINT64(24), // Cast value to uint64_t + CAST_TO_FLOAT64(25); // Cast value to double private final byte nativeId; diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java index 88b20414b0c..1ab3b97945d 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,199 +17,302 @@ package ai.rapids.cudf.nvcomp; import ai.rapids.cudf.BaseDeviceMemoryBuffer; +import ai.rapids.cudf.CloseableArray; import ai.rapids.cudf.Cuda; import ai.rapids.cudf.DeviceMemoryBuffer; import ai.rapids.cudf.HostMemoryBuffer; +import ai.rapids.cudf.MemoryBuffer; +import ai.rapids.cudf.NvtxColor; +import ai.rapids.cudf.NvtxRange; + +import java.util.Arrays; /** Multi-buffer LZ4 compressor */ public class BatchedLZ4Compressor { - /** Describes a batched compression result */ - public static class BatchedCompressionResult { - private final DeviceMemoryBuffer[] compressedBuffers; - private final long[] compressedSizes; - - BatchedCompressionResult(DeviceMemoryBuffer[] buffers, long[] sizes) { - this.compressedBuffers = buffers; - this.compressedSizes = sizes; - } + static final long MAX_CHUNK_SIZE = 16777216; // in bytes + // each chunk has a 64-bit integer value as metadata containing the compressed size + static final long METADATA_BYTES_PER_CHUNK = 8; - /** - * Get the output compressed buffers corresponding to the input buffers. - * Note that the buffers are likely larger than required to store the compressed data. - */ - public DeviceMemoryBuffer[] getCompressedBuffers() { - return compressedBuffers; - } - - /** Get the corresponding amount of compressed data in each output buffer. */ - public long[] getCompressedSizes() { - return compressedSizes; - } - } + private final long chunkSize; + private final long targetIntermediateBufferSize; + private final long maxOutputChunkSize; /** - * Get the amount of temporary storage space required to compress a batch of buffers. - * @param inputs batch of data buffers to be individually compressed - * @param chunkSize compression chunk size to use - * @return amount in bytes of temporary storage space required to compress the batch + * Construct a batched LZ4 compressor instance + * @param chunkSize maximum amount of uncompressed data to compress as a single chunk. Inputs + * larger than this will be compressed in multiple chunks. + * @param targetIntermediateBufferSize desired maximum size of intermediate device buffers + * used during compression. 
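A construction and usage sketch for the reworked batched compressor, using illustrative sizes; note that the compress() javadoc in this hunk states the input buffers are closed by the call:

```java
// 64 KiB chunks and ~8 MiB intermediate buffers (both values are illustrative).
BatchedLZ4Compressor compressor =
    new BatchedLZ4Compressor(64 * 1024, 8L * 1024 * 1024);
// 'inputs' is an existing BaseDeviceMemoryBuffer[]; compress() takes ownership of it.
DeviceMemoryBuffer[] compressed = compressor.compress(inputs, Cuda.DEFAULT_STREAM);
```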
*/ - public static long getTempSize(BaseDeviceMemoryBuffer[] inputs, long chunkSize) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); - } - int numBuffers = inputs.length; - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); - } - return NvcompJni.batchedLZ4CompressGetTempSize(inputAddrs, inputSizes, chunkSize); + public BatchedLZ4Compressor(long chunkSize, long targetIntermediateBufferSize) { + validateChunkSize(chunkSize); + this.chunkSize = chunkSize; + this.maxOutputChunkSize = NvcompJni.batchedLZ4CompressGetMaxOutputChunkSize(chunkSize); + assert maxOutputChunkSize < Integer.MAX_VALUE; + this.targetIntermediateBufferSize = Math.max(targetIntermediateBufferSize, maxOutputChunkSize); } /** - * Get the amount of output storage space required to compress a batch of buffers. - * @param inputs batch of data buffers to be individually compressed - * @param chunkSize compression chunk size to use - * @param tempBuffer temporary storage space - * @return amount in bytes of output storage space corresponding to each input buffer in the batch + * Compress a batch of buffers with LZ4. The input buffers will be closed. + * @param origInputs buffers to compress + * @param stream CUDA stream to use + * @return compressed buffers corresponding to the input buffers */ - public static long[] getOutputSizes(BaseDeviceMemoryBuffer[] inputs, long chunkSize, - BaseDeviceMemoryBuffer tempBuffer) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); + public DeviceMemoryBuffer[] compress(BaseDeviceMemoryBuffer[] origInputs, Cuda.Stream stream) { + try (CloseableArray inputs = CloseableArray.wrap(origInputs)) { + if (chunkSize <= 0) { + throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); + } + final int numInputs = inputs.size(); + if (numInputs == 0) { + return new DeviceMemoryBuffer[0]; + } + + // Each buffer is broken up into chunkSize chunks for compression. Calculate how many + // chunks are needed for each input buffer. + int[] chunksPerInput = new int[numInputs]; + int numChunks = 0; + for (int i = 0; i < numInputs; i++) { + BaseDeviceMemoryBuffer buffer = inputs.get(i); + int numBufferChunks = getNumChunksInBuffer(buffer); + chunksPerInput[i] = numBufferChunks; + numChunks += numBufferChunks; + } + + // Allocate buffers for each chunk and generate parallel lists of chunk source addresses, + // chunk destination addresses, and sizes. 
+ try (CloseableArray compressedBuffers = + allocCompressedBuffers(numChunks, stream); + DeviceMemoryBuffer compressedChunkSizes = + DeviceMemoryBuffer.allocate(numChunks * 8L, stream)) { + long[] inputChunkAddrs = new long[numChunks]; + long[] inputChunkSizes = new long[numChunks]; + long[] outputChunkAddrs = new long[numChunks]; + buildAddrsAndSizes(inputs, inputChunkAddrs, inputChunkSizes, + compressedBuffers, outputChunkAddrs); + + long[] outputChunkSizes; + final long tempBufferSize = NvcompJni.batchedLZ4CompressGetTempSize(numChunks, chunkSize); + try (DeviceMemoryBuffer addrsAndSizes = + putAddrsAndSizesOnDevice(inputChunkAddrs, inputChunkSizes, outputChunkAddrs, stream); + DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempBufferSize, stream)) { + final long devOutputAddrsPtr = addrsAndSizes.getAddress() + numChunks * 8L; + final long devInputSizesPtr = devOutputAddrsPtr + numChunks * 8L; + NvcompJni.batchedLZ4CompressAsync( + addrsAndSizes.getAddress(), + devInputSizesPtr, + chunkSize, + numChunks, + tempBuffer.getAddress(), + tempBufferSize, + devOutputAddrsPtr, + compressedChunkSizes.getAddress(), + stream.getStream()); + } + + // Synchronously copy the resulting compressed sizes per chunk. + outputChunkSizes = getOutputChunkSizes(compressedChunkSizes, stream); + + // inputs are no longer needed at this point, so free them early + inputs.close(); + + // Combine compressed chunks into output buffers corresponding to each original input + return stitchOutput(chunksPerInput, compressedChunkSizes, outputChunkAddrs, + outputChunkSizes, stream); + } } - int numBuffers = inputs.length; - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); + } + + static void validateChunkSize(long chunkSize) { + if (chunkSize <= 0 || chunkSize > MAX_CHUNK_SIZE) { + throw new IllegalArgumentException("Invalid chunk size: " + chunkSize + " Max chunk size is: " + + MAX_CHUNK_SIZE + " bytes"); } - return NvcompJni.batchedLZ4CompressGetOutputSize(inputAddrs, inputSizes, chunkSize, - tempBuffer.getAddress(), tempBuffer.getLength()); } - /** - * Calculates the minimum size in bytes necessary to store the compressed output sizes - * when performing an asynchronous batch compression. - * @param numBuffers number of buffers in the batch - * @return minimum size of the compressed output sizes buffer needed - */ - public static long getCompressedSizesBufferSize(int numBuffers) { - // Each compressed size value is a 64-bit long - return numBuffers * 8; + private static long ceilingDivide(long x, long y) { + return (x + y - 1) / y; } - /** - * Asynchronously compress a batch of input buffers. The compressed size output buffer must be - * pinned memory for this operation to be truly asynchronous. Note that the caller must - * synchronize on the specified CUDA stream in order to safely examine the compressed output - * sizes! 
- * @param compressedSizesOutputBuffer host memory where the compressed output size will be stored - * @param inputs buffers to compress - * @param chunkSize type of data within each buffer - * @param tempBuffer compression chunk size to use - * @param outputs output buffers that will contain the compressed results - * @param stream CUDA stream to use - */ - public static void compressAsync(HostMemoryBuffer compressedSizesOutputBuffer, - BaseDeviceMemoryBuffer[] inputs, long chunkSize, - BaseDeviceMemoryBuffer tempBuffer, - BaseDeviceMemoryBuffer[] outputs, Cuda.Stream stream) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); - } - int numBuffers = inputs.length; - if (outputs.length != numBuffers) { - throw new IllegalArgumentException("buffer count mismatch, " + numBuffers + " inputs and " + - outputs.length + " outputs"); - } - if (compressedSizesOutputBuffer.getLength() < getCompressedSizesBufferSize(numBuffers)) { - throw new IllegalArgumentException("compressed output size buffer must be able to hold " + - "at least 8 bytes per buffer, size is only " + compressedSizesOutputBuffer.getLength()); - } + private int getNumChunksInBuffer(MemoryBuffer buffer) { + return (int) ceilingDivide(buffer.getLength(), chunkSize); + } - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); + private CloseableArray allocCompressedBuffers(long numChunks, + Cuda.Stream stream) { + final long chunksPerBuffer = targetIntermediateBufferSize / maxOutputChunkSize; + final long numBuffers = ceilingDivide(numChunks, chunksPerBuffer); + if (numBuffers > Integer.MAX_VALUE) { + throw new IllegalStateException("Too many chunks"); } - - long[] outputAddrs = new long[numBuffers]; - long[] outputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = outputs[i]; - outputAddrs[i] = buffer.getAddress(); - outputSizes[i] = buffer.getLength(); + try (NvtxRange range = new NvtxRange("allocCompressedBuffers", NvtxColor.YELLOW)) { + CloseableArray buffers = CloseableArray.wrap( + new DeviceMemoryBuffer[(int) numBuffers]); + try { + // allocate all of the max-chunks intermediate compressed buffers + for (int i = 0; i < buffers.size() - 1; ++i) { + buffers.set(i, DeviceMemoryBuffer.allocate(chunksPerBuffer * maxOutputChunkSize, stream)); + } + // allocate the tail intermediate compressed buffer that may be smaller than the others + buffers.set(buffers.size() - 1, DeviceMemoryBuffer.allocate( + (numChunks - chunksPerBuffer * (buffers.size() - 1)) * maxOutputChunkSize, stream)); + return buffers; + } catch (Exception e) { + buffers.close(e); + throw e; + } } - - NvcompJni.batchedLZ4CompressAsync(compressedSizesOutputBuffer.getAddress(), - inputAddrs, inputSizes, chunkSize, tempBuffer.getAddress(), tempBuffer.getLength(), - outputAddrs, outputSizes, stream.getStream()); } - /** - * Compress a batch of buffers with LZ4 - * @param inputs buffers to compress - * @param chunkSize compression chunk size to use - * @param stream CUDA stream to use - * @return compression results containing the corresponding output buffer and compressed data size - * for each input buffer - */ - public static BatchedCompressionResult compress(BaseDeviceMemoryBuffer[] inputs, long chunkSize, - Cuda.Stream stream) { - if (chunkSize <= 0) { - throw new 
IllegalArgumentException("Illegal chunk size: " + chunkSize); + // Fill in the inputChunkAddrs, inputChunkSizes, and outputChunkAddrs arrays to point + // into the chunks in the input and output buffers. + private void buildAddrsAndSizes(CloseableArray inputs, + long[] inputChunkAddrs, + long[] inputChunkSizes, + CloseableArray compressedBuffers, + long[] outputChunkAddrs) { + // setup the input addresses and sizes + int chunkIdx = 0; + for (BaseDeviceMemoryBuffer input : inputs.getArray()) { + final int numChunksInBuffer = getNumChunksInBuffer(input); + for (int i = 0; i < numChunksInBuffer; i++) { + inputChunkAddrs[chunkIdx] = input.getAddress() + i * chunkSize; + inputChunkSizes[chunkIdx] = (i != numChunksInBuffer - 1) ? chunkSize + : (input.getLength() - (long) i * chunkSize); + ++chunkIdx; + } } - int numBuffers = inputs.length; - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); + assert chunkIdx == inputChunkAddrs.length; + assert chunkIdx == inputChunkSizes.length; + + // setup output addresses + chunkIdx = 0; + for (DeviceMemoryBuffer buffer : compressedBuffers.getArray()) { + assert buffer.getLength() % maxOutputChunkSize == 0; + long numChunksInBuffer = buffer.getLength() / maxOutputChunkSize; + long baseAddr = buffer.getAddress(); + for (int i = 0; i < numChunksInBuffer; i++) { + outputChunkAddrs[chunkIdx++] = baseAddr + i * maxOutputChunkSize; + } } + assert chunkIdx == outputChunkAddrs.length; + } - DeviceMemoryBuffer[] outputBuffers = new DeviceMemoryBuffer[numBuffers]; - try { - long tempSize = NvcompJni.batchedLZ4CompressGetTempSize(inputAddrs, inputSizes, chunkSize); - try (DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempSize)) { - long[] outputSizes = NvcompJni.batchedLZ4CompressGetOutputSize(inputAddrs, inputSizes, - chunkSize, tempBuffer.getAddress(), tempBuffer.getLength()); - long[] outputAddrs = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - DeviceMemoryBuffer buffer = DeviceMemoryBuffer.allocate(outputSizes[i]); - outputBuffers[i] = buffer; - outputAddrs[i] = buffer.getAddress(); + // Write input addresses, output addresses and sizes contiguously into a DeviceMemoryBuffer. 
+ private DeviceMemoryBuffer putAddrsAndSizesOnDevice(long[] inputAddrs, + long[] inputSizes, + long[] outputAddrs, + Cuda.Stream stream) { + final long totalSize = inputAddrs.length * 8L * 3; // space for input, output, and size arrays + final long outputAddrsOffset = inputAddrs.length * 8L; + final long sizesOffset = outputAddrsOffset + inputAddrs.length * 8L; + try (NvtxRange range = new NvtxRange("putAddrsAndSizesOnDevice", NvtxColor.YELLOW)) { + try (HostMemoryBuffer hostbuf = HostMemoryBuffer.allocate(totalSize); + DeviceMemoryBuffer result = DeviceMemoryBuffer.allocate(totalSize)) { + hostbuf.setLongs(0, inputAddrs, 0, inputAddrs.length); + hostbuf.setLongs(outputAddrsOffset, outputAddrs, 0, outputAddrs.length); + for (int i = 0; i < inputSizes.length; i++) { + hostbuf.setLong(sizesOffset + i * 8L, inputSizes[i]); } + result.copyFromHostBuffer(hostbuf, stream); + result.incRefCount(); + return result; + } + } + } - long compressedSizesBufferSize = getCompressedSizesBufferSize(numBuffers); - try (HostMemoryBuffer compressedSizesBuffer = - HostMemoryBuffer.allocate(compressedSizesBufferSize)) { - NvcompJni.batchedLZ4CompressAsync(compressedSizesBuffer.getAddress(), - inputAddrs, inputSizes, chunkSize, - tempBuffer.getAddress(), tempBuffer.getLength(), - outputAddrs, outputSizes, stream.getStream()); - stream.sync(); - long[] compressedSizes = new long[numBuffers]; - compressedSizesBuffer.getLongs(compressedSizes, 0, 0, numBuffers); - return new BatchedCompressionResult(outputBuffers, compressedSizes); + // Synchronously copy the resulting compressed sizes from device memory to host memory. + private long[] getOutputChunkSizes(BaseDeviceMemoryBuffer devChunkSizes, Cuda.Stream stream) { + try (NvtxRange range = new NvtxRange("getOutputChunkSizes", NvtxColor.YELLOW)) { + try (HostMemoryBuffer hostbuf = HostMemoryBuffer.allocate(devChunkSizes.getLength())) { + hostbuf.copyFromDeviceBuffer(devChunkSizes, stream); + int numChunks = (int) (devChunkSizes.getLength() / 8); + long[] result = new long[numChunks]; + for (int i = 0; i < numChunks; i++) { + long size = hostbuf.getLong(i * 8L); + assert size < Integer.MAX_VALUE : "output size is too big"; + result[i] = size; } + return result; } - } catch (Throwable t) { - for (DeviceMemoryBuffer buffer : outputBuffers) { - if (buffer != null) { - buffer.close(); + } + } + + // Stitch together the individual chunks into the result buffers. + // Each result buffer has metadata at the beginning, followed by compressed chunks. + // This is done by building up parallel lists of source addr, dest addr and size and + // then calling multiBufferCopyAsync() + private DeviceMemoryBuffer[] stitchOutput(int[] chunksPerInput, + DeviceMemoryBuffer compressedChunkSizes, + long[] outputChunkAddrs, + long[] outputChunkSizes, + Cuda.Stream stream) { + try (NvtxRange range = new NvtxRange("stitchOutput", NvtxColor.YELLOW)) { + final int numOutputs = chunksPerInput.length; + final long chunkSizesAddr = compressedChunkSizes.getAddress(); + long[] outputBufferSizes = calcOutputBufferSizes(chunksPerInput, outputChunkSizes); + try (CloseableArray outputs = + CloseableArray.wrap(new DeviceMemoryBuffer[numOutputs])) { + // Each chunk needs to be copied, and each output needs a copy of the + // compressed chunk size vector representing the metadata. 
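+          // Resulting per-output layout (for reference): N compressed-chunk sizes
+          // (8 bytes each) at the front, followed by the N compressed chunks back to back.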
+ final int totalBuffersToCopy = numOutputs + outputChunkAddrs.length; + long[] destAddrs = new long[totalBuffersToCopy]; + long[] srcAddrs = new long[totalBuffersToCopy]; + long[] sizes = new long[totalBuffersToCopy]; + int copyBufferIdx = 0; + int chunkIdx = 0; + for (int outputIdx = 0; outputIdx < numOutputs; outputIdx++) { + DeviceMemoryBuffer outputBuffer = DeviceMemoryBuffer.allocate(outputBufferSizes[outputIdx]); + final long outputBufferAddr = outputBuffer.getAddress(); + outputs.set(outputIdx, outputBuffer); + final long numChunks = chunksPerInput[outputIdx]; + final long metadataSize = numChunks * METADATA_BYTES_PER_CHUNK; + + // setup a copy of the metadata at the front of the output buffer + srcAddrs[copyBufferIdx] = chunkSizesAddr + chunkIdx * 8; + destAddrs[copyBufferIdx] = outputBufferAddr; + sizes[copyBufferIdx] = metadataSize; + ++copyBufferIdx; + + // setup copies of the compressed chunks for this output buffer + long nextChunkAddr = outputBufferAddr + metadataSize; + for (int i = 0; i < numChunks; ++i) { + srcAddrs[copyBufferIdx] = outputChunkAddrs[chunkIdx]; + destAddrs[copyBufferIdx] = nextChunkAddr; + final long chunkSize = outputChunkSizes[chunkIdx]; + sizes[copyBufferIdx] = chunkSize; + copyBufferIdx++; + chunkIdx++; + nextChunkAddr += chunkSize; + } } + assert copyBufferIdx == totalBuffersToCopy; + assert chunkIdx == outputChunkAddrs.length; + assert chunkIdx == outputChunkSizes.length; + + Cuda.multiBufferCopyAsync(destAddrs, srcAddrs, sizes, stream); + return outputs.release(); } - throw t; } } - + // Calculate the list of sizes for each output buffer (metadata plus size of compressed chunks) + private long[] calcOutputBufferSizes(int[] chunksPerInput, + long[] outputChunkSizes) { + long[] sizes = new long[chunksPerInput.length]; + int chunkIdx = 0; + for (int i = 0; i < sizes.length; i++) { + final int chunksInBuffer = chunksPerInput[i]; + final int chunkEndIdx = chunkIdx + chunksInBuffer; + // metadata stored in front of compressed data + long bufferSize = METADATA_BYTES_PER_CHUNK * chunksInBuffer; + // add in the compressed chunk sizes to get the total size + while (chunkIdx < chunkEndIdx) { + bufferSize += outputChunkSizes[chunkIdx++]; + } + sizes[i] = bufferSize; + } + assert chunkIdx == outputChunkSizes.length; + return sizes; + } } diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java index 61969db4fb4..40ad4d5e9ed 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,214 +16,183 @@ package ai.rapids.cudf.nvcomp; +import ai.rapids.cudf.CloseableArray; import ai.rapids.cudf.Cuda; import ai.rapids.cudf.BaseDeviceMemoryBuffer; import ai.rapids.cudf.DeviceMemoryBuffer; -import ai.rapids.cudf.MemoryCleaner; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import ai.rapids.cudf.HostMemoryBuffer; +import ai.rapids.cudf.NvtxColor; +import ai.rapids.cudf.NvtxRange; + +import java.util.Arrays; /** LZ4 decompressor that operates on multiple input buffers in a batch */ public class BatchedLZ4Decompressor { - private static final Logger log = LoggerFactory.getLogger(Decompressor.class); - - /** - * Get the metadata associated with a batch of compressed buffers - * @param inputs compressed buffers that will be decompressed - * @param stream CUDA stream to use - * @return opaque metadata object - */ - public static BatchedMetadata getMetadata(BaseDeviceMemoryBuffer[] inputs, Cuda.Stream stream) { - long[] inputAddrs = new long[inputs.length]; - long[] inputSizes = new long[inputs.length]; - for (int i = 0; i < inputs.length; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); - } - return new BatchedMetadata(NvcompJni.batchedLZ4DecompressGetMetadata( - inputAddrs, inputSizes, stream.getStream())); - } - - /** - * Get the amount of temporary storage required to decompress a batch of buffers - * @param metadata metadata retrieved from the compressed buffers - * @return amount in bytes of temporary storage space required to decompress the buffer batch - */ - public static long getTempSize(BatchedMetadata metadata) { - return NvcompJni.batchedLZ4DecompressGetTempSize(metadata.getMetadata()); - } - - /** - * Get the amount of ouptut storage required to decopmress a batch of buffers - * @param metadata metadata retrieved from the compressed buffers - * @param numOutputs number of buffers in the batch - * @return amount in bytes of temporary storage space required to decompress the buffer batch - */ - public static long[] getOutputSizes(BatchedMetadata metadata, int numOutputs) { - return NvcompJni.batchedLZ4DecompressGetOutputSize(metadata.getMetadata(), numOutputs); - } - /** * Asynchronously decompress a batch of buffers - * @param inputs buffers to decompress - * @param tempBuffer temporary buffer - * @param metadata metadata retrieved from the compressed buffers - * @param outputs output buffers that will contain the compressed results - * @param stream CUDA stream to use + * @param chunkSize maximum uncompressed block size, must match value used during compression + * @param origInputs buffers to decompress, will be closed by this operation + * @param outputs output buffers that will contain the compressed results, each must be sized + * to the exact decompressed size of the corresponding input + * @param stream CUDA stream to use */ - public static void decompressAsync(BaseDeviceMemoryBuffer[] inputs, - BaseDeviceMemoryBuffer tempBuffer, BatchedMetadata metadata, - BaseDeviceMemoryBuffer[] outputs, Cuda.Stream stream) { - int numBuffers = inputs.length; - if (outputs.length != numBuffers) { - throw new IllegalArgumentException("buffer count mismatch, " + numBuffers + " inputs and " + - outputs.length + " outputs"); - } + public static void decompressAsync(long chunkSize, + BaseDeviceMemoryBuffer[] origInputs, + BaseDeviceMemoryBuffer[] outputs, + Cuda.Stream stream) { + try (CloseableArray inputs = + CloseableArray.wrap(Arrays.copyOf(origInputs, origInputs.length))) { + 
BatchedLZ4Compressor.validateChunkSize(chunkSize); + if (origInputs.length != outputs.length) { + throw new IllegalArgumentException("number of inputs must match number of outputs"); + } + final int numInputs = inputs.size(); + if (numInputs == 0) { + return; + } - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); - } + int[] chunksPerInput = new int[numInputs]; + long totalChunks = 0; + for (int i = 0; i < numInputs; i++) { + // use output size to determine number of chunks in the input, as the output buffer + // must be exactly sized to the uncompressed data + BaseDeviceMemoryBuffer buffer = outputs[i]; + int numBufferChunks = getNumChunksInBuffer(chunkSize, buffer); + chunksPerInput[i] = numBufferChunks; + totalChunks += numBufferChunks; + } - long[] outputAddrs = new long[numBuffers]; - long[] outputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = outputs[i]; - outputAddrs[i] = buffer.getAddress(); - outputSizes[i] = buffer.getLength(); + final long tempBufferSize = NvcompJni.batchedLZ4DecompressGetTempSize(totalChunks, chunkSize); + try (DeviceMemoryBuffer devAddrsSizes = + buildAddrsSizesBuffer(chunkSize, totalChunks, inputs.getArray(), chunksPerInput, + outputs, stream); + DeviceMemoryBuffer devTemp = DeviceMemoryBuffer.allocate(tempBufferSize)) { + // buffer containing addresses and sizes contains four vectors of longs in this order: + // - compressed chunk input addresses + // - chunk output buffer addresses + // - compressed chunk sizes + // - uncompressed chunk sizes + final long inputAddrsPtr = devAddrsSizes.getAddress(); + final long outputAddrsPtr = inputAddrsPtr + totalChunks * 8; + final long inputSizesPtr = outputAddrsPtr + totalChunks * 8; + final long outputSizesPtr = inputSizesPtr + totalChunks * 8; + NvcompJni.batchedLZ4DecompressAsync( + inputAddrsPtr, + inputSizesPtr, + outputSizesPtr, + totalChunks, + devTemp.getAddress(), + devTemp.getLength(), + outputAddrsPtr, + stream.getStream()); + } } + } - NvcompJni.batchedLZ4DecompressAsync(inputAddrs, inputSizes, - tempBuffer.getAddress(), tempBuffer.getLength(), metadata.getMetadata(), - outputAddrs, outputSizes, stream.getStream()); + private static int getNumChunksInBuffer(long chunkSize, BaseDeviceMemoryBuffer buffer) { + return (int) ((buffer.getLength() + chunkSize - 1) / chunkSize); } /** - * Asynchronously decompress a batch of buffers - * @param inputs buffers to decompress + * Build a device memory buffer containing four vectors of longs in the following order: + *

+ * <ul>
+ *   <li>compressed chunk input addresses</li>
+ *   <li>uncompressed chunk output addresses</li>
+ *   <li>compressed chunk sizes</li>
+ *   <li>uncompressed chunk sizes</li>
+ * </ul>
+ * Each vector contains as many longs as the number of chunks being decompressed + * @param chunkSize maximum uncompressed size of a chunk + * @param totalChunks total number of chunks to be decompressed + * @param inputs device buffers containing the compressed data + * @param chunksPerInput number of compressed chunks per input buffer + * @param outputs device buffers that will hold the uncompressed output * @param stream CUDA stream to use - * @return output buffers containing compressed data corresponding to the input buffers + * @return device buffer containing address and size vectors */ - public static DeviceMemoryBuffer[] decompressAsync(BaseDeviceMemoryBuffer[] inputs, - Cuda.Stream stream) { - int numBuffers = inputs.length; - long[] inputAddrs = new long[numBuffers]; - long[] inputSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - BaseDeviceMemoryBuffer buffer = inputs[i]; - inputAddrs[i] = buffer.getAddress(); - inputSizes[i] = buffer.getLength(); - } - - long metadata = NvcompJni.batchedLZ4DecompressGetMetadata(inputAddrs, inputSizes, - stream.getStream()); - try { - long[] outputSizes = NvcompJni.batchedLZ4DecompressGetOutputSize(metadata, numBuffers); - long[] outputAddrs = new long[numBuffers]; - DeviceMemoryBuffer[] outputs = new DeviceMemoryBuffer[numBuffers]; - try { - for (int i = 0; i < numBuffers; ++i) { - DeviceMemoryBuffer buffer = DeviceMemoryBuffer.allocate(outputSizes[i]); - outputs[i] = buffer; - outputAddrs[i] = buffer.getAddress(); - } - - long tempSize = NvcompJni.batchedLZ4DecompressGetTempSize(metadata); - try (DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempSize)) { - NvcompJni.batchedLZ4DecompressAsync(inputAddrs, inputSizes, - tempBuffer.getAddress(), tempBuffer.getLength(), metadata, - outputAddrs, outputSizes, stream.getStream()); - } - } catch (Throwable t) { - for (DeviceMemoryBuffer buffer : outputs) { - if (buffer != null) { - buffer.close(); + private static DeviceMemoryBuffer buildAddrsSizesBuffer(long chunkSize, + long totalChunks, + BaseDeviceMemoryBuffer[] inputs, + int[] chunksPerInput, + BaseDeviceMemoryBuffer[] outputs, + Cuda.Stream stream) { + final long totalBufferSize = totalChunks * 8L * 4L; + try (NvtxRange range = new NvtxRange("buildAddrSizesBuffer", NvtxColor.YELLOW)) { + try (HostMemoryBuffer metadata = fetchMetadata(totalChunks, inputs, chunksPerInput, stream); + HostMemoryBuffer hostAddrsSizes = HostMemoryBuffer.allocate(totalBufferSize); + DeviceMemoryBuffer devAddrsSizes = DeviceMemoryBuffer.allocate(totalBufferSize)) { + // Build four long vectors in the AddrsSizes buffer: + // - compressed input address (one per chunk) + // - uncompressed output address (one per chunk) + // - compressed input size (one per chunk) + // - uncompressed input size (one per chunk) + final long srcAddrsOffset = 0; + final long destAddrsOffset = srcAddrsOffset + totalChunks * 8L; + final long srcSizesOffset = destAddrsOffset + totalChunks * 8L; + final long destSizesOffset = srcSizesOffset + totalChunks * 8L; + long chunkIdx = 0; + for (int inputIdx = 0; inputIdx < inputs.length; inputIdx++) { + final BaseDeviceMemoryBuffer input = inputs[inputIdx]; + final BaseDeviceMemoryBuffer output = outputs[inputIdx]; + final int numChunksInInput = chunksPerInput[inputIdx]; + long srcAddr = input.getAddress() + + BatchedLZ4Compressor.METADATA_BYTES_PER_CHUNK * numChunksInInput; + long destAddr = output.getAddress(); + final long chunkIdxEnd = chunkIdx + numChunksInInput; + while (chunkIdx < chunkIdxEnd) { + final long 
srcChunkSize = metadata.getLong(chunkIdx * 8); + final long destChunkSize = (chunkIdx < chunkIdxEnd - 1) ? chunkSize + : output.getAddress() + output.getLength() - destAddr; + hostAddrsSizes.setLong(srcAddrsOffset + chunkIdx * 8, srcAddr); + hostAddrsSizes.setLong(destAddrsOffset + chunkIdx * 8, destAddr); + hostAddrsSizes.setLong(srcSizesOffset + chunkIdx * 8, srcChunkSize); + hostAddrsSizes.setLong(destSizesOffset + chunkIdx * 8, destChunkSize); + srcAddr += srcChunkSize; + destAddr += destChunkSize; + ++chunkIdx; } } - throw t; + devAddrsSizes.copyFromHostBuffer(hostAddrsSizes, stream); + devAddrsSizes.incRefCount(); + return devAddrsSizes; } - - return outputs; - } finally { - NvcompJni.batchedLZ4DecompressDestroyMetadata(metadata); } } - /** Opaque metadata object for batched LZ4 decompression */ - public static class BatchedMetadata implements AutoCloseable { - private final BatchedMetadataCleaner cleaner; - private final long id; - private boolean closed = false; - - BatchedMetadata(long metadata) { - this.cleaner = new BatchedMetadataCleaner(metadata); - this.id = cleaner.id; - MemoryCleaner.register(this, cleaner); - cleaner.addRef(); - } - - long getMetadata() { - return cleaner.metadata; - } - - public boolean isLZ4Metadata() { - return NvcompJni.isLZ4Metadata(getMetadata()); - } - - @Override - public synchronized void close() { - if (!closed) { - cleaner.delRef(); - cleaner.clean(false); - closed = true; - } else { - cleaner.logRefCountDebug("double free " + this); - throw new IllegalStateException("Close called too many times " + this); - } - } - - @Override - public String toString() { - return "LZ4 BATCHED METADATA (ID: " + id + " " + - Long.toHexString(cleaner.metadata) + ")"; - } - - private static class BatchedMetadataCleaner extends MemoryCleaner.Cleaner { - private long metadata; - - BatchedMetadataCleaner(long metadata) { - this.metadata = metadata; - } - - @Override - protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { - boolean neededCleanup = false; - long address = metadata; - if (metadata != 0) { - try { - NvcompJni.batchedLZ4DecompressDestroyMetadata(metadata); - } finally { - // Always mark the resource as freed even if an exception is thrown. - // We cannot know how far it progressed before the exception, and - // therefore it is unsafe to retry. - metadata = 0; - } - neededCleanup = true; - } - if (neededCleanup && logErrorIfNotClean) { - log.error("LZ4 BATCHED METADATA WAS LEAKED (Address: " + Long.toHexString(address) + ")"); - logRefCountDebug("Leaked event"); + /** + * Fetch the metadata at the front of each input in a single, contiguous host buffer. + * @param totalChunks total number of compressed chunks + * @param inputs buffers containing the compressed data + * @param chunksPerInput number of compressed chunks for the corresponding input + * @param stream CUDA stream to use + * @return host buffer containing all of the metadata + */ + private static HostMemoryBuffer fetchMetadata(long totalChunks, + BaseDeviceMemoryBuffer[] inputs, + int[] chunksPerInput, + Cuda.Stream stream) { + try (NvtxRange range = new NvtxRange("fetchMetadata", NvtxColor.PURPLE)) { + // one long per chunk containing the compressed size + final long totalMetadataSize = totalChunks * BatchedLZ4Compressor.METADATA_BYTES_PER_CHUNK; + // Build corresponding vectors of destination addresses, source addresses and sizes. 
+ long[] destAddrs = new long[inputs.length]; + long[] srcAddrs = new long[inputs.length]; + long[] sizes = new long[inputs.length]; + try (HostMemoryBuffer hostMetadata = HostMemoryBuffer.allocate(totalMetadataSize); + DeviceMemoryBuffer devMetadata = DeviceMemoryBuffer.allocate(totalMetadataSize)) { + long destCopyAddr = devMetadata.getAddress(); + for (int inputIdx = 0; inputIdx < inputs.length; inputIdx++) { + final BaseDeviceMemoryBuffer input = inputs[inputIdx]; + final long copySize = chunksPerInput[inputIdx] * BatchedLZ4Compressor.METADATA_BYTES_PER_CHUNK; + destAddrs[inputIdx] = destCopyAddr; + srcAddrs[inputIdx] = input.getAddress(); + sizes[inputIdx] = copySize; + destCopyAddr += copySize; } - return neededCleanup; - } - - @Override - public boolean isClean() { - return metadata != 0; + Cuda.multiBufferCopyAsync(destAddrs, srcAddrs, sizes, stream); + hostMetadata.copyFromDeviceBuffer(devMetadata, stream); + hostMetadata.incRefCount(); + return hostMetadata; } } } diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/CompressionType.java b/java/src/main/java/ai/rapids/cudf/nvcomp/CompressionType.java index 5a133acbf7c..70f0a021a4d 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/CompressionType.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/CompressionType.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,10 @@ public enum CompressionType { INT(4), UINT(5), LONGLONG(6), - ULONGLONG(7); + ULONGLONG(7), + BITS(0xff); + + private static final CompressionType[] types = CompressionType.values(); final int nativeId; @@ -33,6 +36,17 @@ public enum CompressionType { this.nativeId = nativeId; } + /** Lookup the CompressionType that corresponds to the specified native identifier */ + public static CompressionType fromNativeId(int id) { + for (CompressionType type : types) { + if (type.nativeId == id) { + return type; + } + } + throw new IllegalArgumentException("Unknown compression type ID: " + id); + } + + /** Get the native code identifier for the type */ public final int toNativeId() { return nativeId; } diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/Decompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/Decompressor.java deleted file mode 100644 index 90dabfbcf8e..00000000000 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/Decompressor.java +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
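The CompressionType change above adds a reverse lookup from the native identifier back to the enum constant. A small usage sketch, assuming the cudf Java artifact is on the classpath (run with -ea to enable the assertion):

```java
import ai.rapids.cudf.nvcomp.CompressionType;

public class CompressionTypeRoundTrip {
  public static void main(String[] args) {
    // Every constant round-trips through its native identifier.
    for (CompressionType type : CompressionType.values()) {
      int id = type.toNativeId();
      assert CompressionType.fromNativeId(id) == type;
      System.out.println(type + " <-> " + id);
    }
    // Unknown identifiers fail loudly instead of returning null.
    try {
      CompressionType.fromNativeId(-1);
    } catch (IllegalArgumentException e) {
      System.out.println(e.getMessage());
    }
  }
}
```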
- */ - -package ai.rapids.cudf.nvcomp; - -import ai.rapids.cudf.Cuda; -import ai.rapids.cudf.BaseDeviceMemoryBuffer; -import ai.rapids.cudf.MemoryCleaner; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** Generic single-buffer decompressor interface */ -public class Decompressor { - private static final Logger log = LoggerFactory.getLogger(Decompressor.class); - - /** - * Get the metadata associated with a compressed buffer - * @param buffer compressed data buffer - * @param stream CUDA stream to use - * @return opaque metadata object - */ - public static Metadata getMetadata(BaseDeviceMemoryBuffer buffer, Cuda.Stream stream) { - long metadata = NvcompJni.decompressGetMetadata(buffer.getAddress(), buffer.getLength(), - stream.getStream()); - return new Metadata(metadata); - } - - /** - * Get the amount of temporary storage space required to decompress a buffer. - * @param metadata metadata retrieved from the compressed data - * @return amount in bytes of temporary storage space required to decompress - */ - public static long getTempSize(Metadata metadata) { - return NvcompJni.decompressGetTempSize(metadata.getMetadata()); - } - - /** - * Get the amount of output storage space required to hold the uncompressed data. - * @param metadata metadata retrieved from the compressed data - * @return amount in bytes of output storage space required to decompress - */ - public static long getOutputSize(Metadata metadata) { - return NvcompJni.decompressGetOutputSize(metadata.getMetadata()); - } - - /** - * Asynchronously decompress a buffer. - * @param input compressed data buffer - * @param tempBuffer temporary storage buffer - * @param metadata metadata retrieved from compressed data - * @param output output storage buffer - * @param stream CUDA stream to use - */ - public static void decompressAsync(BaseDeviceMemoryBuffer input, BaseDeviceMemoryBuffer tempBuffer, - Metadata metadata, BaseDeviceMemoryBuffer output, Cuda.Stream stream) { - NvcompJni.decompressAsync( - input.getAddress(), input.getLength(), - tempBuffer.getAddress(), tempBuffer.getLength(), - metadata.getMetadata(), - output.getAddress(), output.getLength(), - stream.getStream()); - } - - /** - * Determine if a buffer is data compressed with LZ4. 
- * @param buffer data to examine - * @return true if the data is LZ4 compressed - */ - public static boolean isLZ4Data(BaseDeviceMemoryBuffer buffer) { - return NvcompJni.isLZ4Data(buffer.getAddress(), buffer.getLength()); - } - - - /** Opaque metadata object for single-buffer decompression */ - public static class Metadata implements AutoCloseable { - private final MetadataCleaner cleaner; - private final long id; - private boolean closed = false; - - Metadata(long metadata) { - this.cleaner = new MetadataCleaner(metadata); - this.id = cleaner.id; - MemoryCleaner.register(this, cleaner); - cleaner.addRef(); - } - - long getMetadata() { - return cleaner.metadata; - } - - /** - * Determine if this metadata is associated with LZ4-compressed data - * @return true if the metadata is associated with LZ4-compressed data - */ - public boolean isLZ4Metadata() { - return NvcompJni.isLZ4Metadata(getMetadata()); - } - - @Override - public synchronized void close() { - if (!closed) { - cleaner.delRef(); - cleaner.clean(false); - closed = true; - } else { - cleaner.logRefCountDebug("double free " + this); - throw new IllegalStateException("Close called too many times " + this); - } - } - - @Override - public String toString() { - return "DECOMPRESSOR METADATA (ID: " + id + " " + - Long.toHexString(cleaner.metadata) + ")"; - } - - private static class MetadataCleaner extends MemoryCleaner.Cleaner { - private long metadata; - - MetadataCleaner(long metadata) { - this.metadata = metadata; - } - - @Override - protected synchronized boolean cleanImpl(boolean logErrorIfNotClean) { - boolean neededCleanup = false; - long address = metadata; - if (metadata != 0) { - try { - NvcompJni.decompressDestroyMetadata(metadata); - } finally { - // Always mark the resource as freed even if an exception is thrown. - // We cannot know how far it progressed before the exception, and - // therefore it is unsafe to retry. - metadata = 0; - } - neededCleanup = true; - } - if (neededCleanup && logErrorIfNotClean) { - log.error("DECOMPRESSOR METADATA WAS LEAKED (Address: " + - Long.toHexString(address) + ")"); - logRefCountDebug("Leaked event"); - } - return neededCleanup; - } - - @Override - public boolean isClean() { - return metadata != 0; - } - } - } -} diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java index ce7012a3bee..67a770f1346 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Compressor.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,46 +18,54 @@ import ai.rapids.cudf.Cuda; import ai.rapids.cudf.BaseDeviceMemoryBuffer; +import ai.rapids.cudf.DeviceMemoryBuffer; import ai.rapids.cudf.HostMemoryBuffer; /** Single-buffer compressor implementing LZ4 */ public class LZ4Compressor { - /** - * Calculate the amount of temporary storage space required to compress a buffer. 
- * @param input buffer to compress - * @param inputType type of data within the buffer - * @param chunkSize compression chunk size to use - * @return amount in bytes of temporary storage space required to compress the buffer - */ - public static long getTempSize(BaseDeviceMemoryBuffer input, CompressionType inputType, - long chunkSize) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); + /** LZ4 compression settings corresponding to a chunk size */ + public static final class Configuration { + private final long metadataBytes; + private final long tempBytes; + private final long maxCompressedBytes; + + Configuration(long metadataBytes, long tempBytes, long maxCompressedBytes) { + this.metadataBytes = metadataBytes; + this.tempBytes = tempBytes; + this.maxCompressedBytes = maxCompressedBytes; + } + + /** Get the size of the metadata information in bytes */ + public long getMetadataBytes() { + return metadataBytes; + } + + /** Get the size of the temporary storage in bytes needed to compress */ + public long getTempBytes() { + return tempBytes; + } + + /** Get the maximum compressed output size in bytes */ + public long getMaxCompressedBytes() { + return maxCompressedBytes; } - return NvcompJni.lz4CompressGetTempSize(input.getAddress(), input.getLength(), - inputType.nativeId, chunkSize); } /** - * Calculate the amount of output storage space required to compress a buffer. - * @param input buffer to compress - * @param inputType type of data within the buffer - * @param chunkSize compression chunk size to use - * @param tempBuffer temporary storage space - * @return amount in bytes of output storage space required to compress the buffer + * Get the compression configuration necessary for a particular chunk size. + * @param chunkSize size of an LZ4 chunk in bytes + * @param uncompressedSize total size of the uncompressed data + * @return compression configuration for the specified chunk size */ - public static long getOutputSize(BaseDeviceMemoryBuffer input, CompressionType inputType, - long chunkSize, BaseDeviceMemoryBuffer tempBuffer) { - if (chunkSize <= 0) { - throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); - } - return NvcompJni.lz4CompressGetOutputSize(input.getAddress(), input.getLength(), - inputType.nativeId, chunkSize, tempBuffer.getAddress(), tempBuffer.getLength(), false); + public static Configuration configure(long chunkSize, long uncompressedSize) { + long[] configs = NvcompJni.lz4CompressConfigure(chunkSize, uncompressedSize); + assert configs.length == 3; + return new Configuration(configs[0], configs[1], configs[2]); } /** - * Compress a buffer with LZ4. + * Synchronously compress a buffer with LZ4. 
* @param input buffer to compress * @param inputType type of data within the buffer * @param chunkSize compression chunk size to use @@ -72,16 +80,19 @@ public static long compress(BaseDeviceMemoryBuffer input, CompressionType inputT if (chunkSize <= 0) { throw new IllegalArgumentException("Illegal chunk size: " + chunkSize); } - return NvcompJni.lz4Compress(input.getAddress(), input.getLength(), inputType.nativeId, - chunkSize, tempBuffer.getAddress(), tempBuffer.getLength(), - output.getAddress(), output.getLength(), stream.getStream()); + try (DeviceMemoryBuffer devOutputSizeBuffer = DeviceMemoryBuffer.allocate(Long.BYTES); + HostMemoryBuffer hostOutputSizeBuffer = HostMemoryBuffer.allocate(Long.BYTES)) { + compressAsync(devOutputSizeBuffer, input, inputType, chunkSize, tempBuffer, output, stream); + hostOutputSizeBuffer.copyFromDeviceBuffer(devOutputSizeBuffer, stream); + return hostOutputSizeBuffer.getLong(0); + } } /** * Asynchronously compress a buffer with LZ4. The compressed size output buffer must be pinned * memory for this operation to be truly asynchronous. Note that the caller must synchronize * on the specified CUDA stream in order to safely examine the compressed output size! - * @param compressedSizeOutputBuffer host memory where the compressed output size will be stored + * @param compressedSizeOutputBuffer device memory where the compressed output size will be stored * @param input buffer to compress * @param inputType type of data within the buffer * @param chunkSize compression chunk size to use @@ -89,7 +100,7 @@ public static long compress(BaseDeviceMemoryBuffer input, CompressionType inputT * @param output buffer that will contain the compressed result * @param stream CUDA stream to use */ - public static void compressAsync(HostMemoryBuffer compressedSizeOutputBuffer, + public static void compressAsync(DeviceMemoryBuffer compressedSizeOutputBuffer, BaseDeviceMemoryBuffer input, CompressionType inputType, long chunkSize, BaseDeviceMemoryBuffer tempBuffer, BaseDeviceMemoryBuffer output, Cuda.Stream stream) { @@ -100,9 +111,16 @@ public static void compressAsync(HostMemoryBuffer compressedSizeOutputBuffer, throw new IllegalArgumentException("compressed output size buffer must be able to hold " + "at least 8 bytes, size is only " + compressedSizeOutputBuffer.getLength()); } - NvcompJni.lz4CompressAsync(compressedSizeOutputBuffer.getAddress(), - input.getAddress(), input.getLength(), inputType.nativeId, chunkSize, - tempBuffer.getAddress(), tempBuffer.getLength(), output.getAddress(), output.getLength(), + NvcompJni.lz4CompressAsync( + compressedSizeOutputBuffer.getAddress(), + input.getAddress(), + input.getLength(), + inputType.nativeId, + chunkSize, + tempBuffer.getAddress(), + tempBuffer.getLength(), + output.getAddress(), + output.getLength(), stream.getStream()); } } diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Decompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Decompressor.java new file mode 100644 index 00000000000..46b3127581b --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/LZ4Decompressor.java @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
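With the LZ4Compressor changes above, a caller sizes the temp and output buffers from configure() and, for the async path, hands in a small device buffer that receives the compressed size. A rough usage sketch; the method name, the 64 KiB chunk size, and the CompressionType.INT input type are illustrative choices, and the stream must be synchronized before the size is read, as the javadoc notes:

```java
import ai.rapids.cudf.BaseDeviceMemoryBuffer;
import ai.rapids.cudf.Cuda;
import ai.rapids.cudf.DeviceMemoryBuffer;
import ai.rapids.cudf.HostMemoryBuffer;
import ai.rapids.cudf.nvcomp.CompressionType;
import ai.rapids.cudf.nvcomp.LZ4Compressor;

public class LZ4CompressSketch {
  /** Compress a device buffer and return the compressed size (output discarded for brevity). */
  static long compressAndMeasure(BaseDeviceMemoryBuffer input, Cuda.Stream stream) {
    final long chunkSize = 64 * 1024; // illustrative chunk size
    LZ4Compressor.Configuration config = LZ4Compressor.configure(chunkSize, input.getLength());
    try (DeviceMemoryBuffer temp = DeviceMemoryBuffer.allocate(config.getTempBytes());
         DeviceMemoryBuffer output = DeviceMemoryBuffer.allocate(config.getMaxCompressedBytes());
         DeviceMemoryBuffer devSize = DeviceMemoryBuffer.allocate(Long.BYTES);
         HostMemoryBuffer hostSize = HostMemoryBuffer.allocate(Long.BYTES)) {
      LZ4Compressor.compressAsync(devSize, input, CompressionType.INT, chunkSize,
          temp, output, stream);
      hostSize.copyFromDeviceBuffer(devSize, stream);
      stream.sync(); // the compressed size is only valid after the stream completes
      return hostSize.getLong(0);
    }
  }
}
```

A real caller would keep (or copy out) the output buffer rather than letting the try-with-resources close it.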
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package ai.rapids.cudf.nvcomp; + +import ai.rapids.cudf.BaseDeviceMemoryBuffer; +import ai.rapids.cudf.Cuda; + +/** Single-buffer decompression using LZ4 */ +public class LZ4Decompressor { + + /** + * LZ4 decompression settings corresponding to an LZ4 compressed input. + * NOTE: Each instance must be closed to avoid a native memory leak. + */ + public static final class Configuration implements AutoCloseable { + private final long metadataPtr; + private final long metadataSize; + private final long tempBytes; + private final long uncompressedBytes; + + Configuration(long metadataPtr, long metadataSize, long tempBytes, + long uncompressedBytes) { + this.metadataPtr = metadataPtr; + this.metadataSize = metadataSize; + this.tempBytes = tempBytes; + this.uncompressedBytes = uncompressedBytes; + } + + /** Get the host address of the metadata */ + public long getMetadataPtr() { + return metadataPtr; + } + + /** Get the size of the metadata in bytes */ + public long getMetadataSize() { + return metadataSize; + } + + /** Get the size of the temporary buffer in bytes needed to decompress */ + public long getTempBytes() { + return tempBytes; + } + + /** Get the size of the uncompressed data in bytes */ + public long getUncompressedBytes() { + return uncompressedBytes; + } + + @Override + public void close() { + NvcompJni.lz4DestroyMetadata(metadataPtr); + } + } + + /** + * Determine if a buffer is data compressed with LZ4. + * @param buffer data to examine + * @param stream CUDA stream to use + * @return true if the data is LZ4 compressed + */ + public static boolean isLZ4Data(BaseDeviceMemoryBuffer buffer, Cuda.Stream stream) { + return NvcompJni.isLZ4Data(buffer.getAddress(), buffer.getLength(), stream.getStream()); + } + + /** + * Get the decompression configuration from compressed data. + * NOTE: The resulting configuration object must be closed to avoid a native memory leak. + * @param compressed data that has been compressed by the LZ4 compressor + * @param stream CUDA stream to use + * @return decompression configuration for the specified input + */ + public static Configuration configure(BaseDeviceMemoryBuffer compressed, Cuda.Stream stream) { + long[] configs = NvcompJni.lz4DecompressConfigure(compressed.getAddress(), + compressed.getLength(), stream.getStream()); + assert configs.length == 4; + return new Configuration(configs[0], configs[1], configs[2], configs[3]); + } + + /** + * Asynchronously decompress data compressed with the LZ4 compressor. 
+ * @param compressed buffer containing LZ4-compressed data + * @param config decompression configuration + * @param temp temporary storage buffer + * @param outputBuffer buffer that will be written with the uncompressed output + * @param stream CUDA stream to use + */ + public static void decompressAsync( + BaseDeviceMemoryBuffer compressed, + Configuration config, + BaseDeviceMemoryBuffer temp, + BaseDeviceMemoryBuffer outputBuffer, + Cuda.Stream stream) { + NvcompJni.lz4DecompressAsync( + compressed.getAddress(), + compressed.getLength(), + config.getMetadataPtr(), + config.getMetadataSize(), + temp.getAddress(), + temp.getLength(), + outputBuffer.getAddress(), + outputBuffer.getLength(), + stream.getStream()); + } +} diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java b/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java index 5ce0a8d815d..58f8390d0eb 100644 --- a/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java +++ b/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,69 +24,14 @@ class NvcompJni { NativeDepsLoader.loadNativeDeps(); } - /** - * Extracts the metadata from the input on the device and copies - * it to the host. Note that the result must be released with a - * call to decompressDestroyMetadata - * @param inPtr device address of the compressed data - * @param inSize size of the compressed data in bytes - * @param stream address of CUDA stream that will be used for synchronization - * @return address of the metadata on the host - */ - static native long decompressGetMetadata(long inPtr, long inSize, long stream); - - /** - * Destroys the metadata object and frees the associated memory. - * @param metadataPtr address of the metadata object - */ - static native void decompressDestroyMetadata(long metadataPtr); - - /** - * Computes the temporary storage size needed to decompress. - * This over-estimates the needed storage considerably. - * @param metadataPtr address of the metadata object - * @return the number of temporary storage bytes needed to decompress - */ - static native long decompressGetTempSize(long metadataPtr); - - /** - * Computes the decompressed size of the data. Gets this from the - * metadata contained in the compressed data. - * @param metadataPtr address of the metadata object - * @return the size of the decompressed data in bytes - */ - static native long decompressGetOutputSize(long metadataPtr); - - /** - * Perform asynchronous decompression using the specified CUDA stream. - * The input, temporary, and output buffers must all be in GPU-accessible - * memory. 
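The new single-buffer LZ4Decompressor is driven the same way: configure() inspects the compressed data (and must be closed, since it owns host-side metadata), its sizes drive the temp and output allocations, and decompressAsync() does the work. A minimal sketch under those assumptions; the helper name and error handling are illustrative:

```java
import ai.rapids.cudf.BaseDeviceMemoryBuffer;
import ai.rapids.cudf.Cuda;
import ai.rapids.cudf.DeviceMemoryBuffer;
import ai.rapids.cudf.nvcomp.LZ4Decompressor;

public class LZ4DecompressSketch {
  /** Decompress an LZ4-compressed device buffer; the caller owns the returned buffer. */
  static DeviceMemoryBuffer decompress(BaseDeviceMemoryBuffer compressed, Cuda.Stream stream) {
    if (!LZ4Decompressor.isLZ4Data(compressed, stream)) {
      throw new IllegalArgumentException("buffer does not contain nvcomp LZ4-compressed data");
    }
    // Configuration is AutoCloseable because it owns host-side metadata.
    try (LZ4Decompressor.Configuration config = LZ4Decompressor.configure(compressed, stream);
         DeviceMemoryBuffer temp = DeviceMemoryBuffer.allocate(config.getTempBytes())) {
      DeviceMemoryBuffer output = DeviceMemoryBuffer.allocate(config.getUncompressedBytes());
      try {
        LZ4Decompressor.decompressAsync(compressed, config, temp, output, stream);
        return output;
      } catch (Throwable t) {
        output.close();
        throw t;
      }
    }
  }
}
```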
- * @param inPtr device address of the compressed buffer - * @param inSize size of the compressed data in bytes - * @param tempPtr device address of the temporary decompression storage buffer - * @param tempSize size of the temporary decompression storage buffer - * @param metadataPtr address of the metadata object - * @param outPtr device address of the buffer to use for uncompressed output - * @param outSize size of the uncompressed output buffer in bytes - * @param stream CUDA stream to use - */ - static native void decompressAsync( - long inPtr, - long inSize, - long tempPtr, - long tempSize, - long metadataPtr, - long outPtr, - long outSize, - long stream); - /** * Determine if data is compressed with the nvcomp LZ4 compressor. * @param inPtr device address of the compressed data * @param inSize size of the compressed data in bytes + * @param stream CUDA stream to use * @return true if the data is compressed with the nvcomp LZ4 compressor */ - static native boolean isLZ4Data(long inPtr, long inSize); + static native boolean isLZ4Data(long inPtr, long inSize, long stream); /** * Determine if the metadata corresponds to data compressed with the nvcomp LZ4 compressor. @@ -96,45 +41,21 @@ static native void decompressAsync( static native boolean isLZ4Metadata(long metadataPtr); /** - * Calculate the temporary buffer size needed for LZ4 compression. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param chunkSize size of an LZ4 chunk in bytes - * @return number of temporary storage bytes needed to compress + * Return the LZ4 compression configuration necessary for a particular chunk size. + * @param chunkSize maximum size of an uncompressed chunk in bytes + * @param uncompressedSize total size of the uncompressed data + * @return array of three longs containing metadata size, temp storage size, + * and output buffer size */ - static native long lz4CompressGetTempSize( - long inPtr, - long inSize, - int inputType, - long chunkSize); + static native long[] lz4CompressConfigure(long chunkSize, long uncompressedSize); /** - * Calculate the output buffer size for LZ4 compression. The output - * size can be an estimated upper bound or the exact value. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param chunkSize size of an LZ4 chunk in bytes - * @param tempPtr device address of the temporary storage buffer - * @param tempSize size of the temporary storage buffer in bytes - * @param computeExactSize set to true to compute the exact output size - * @return output buffer size in bytes. If computeExactSize is true then - * this is an exact size otherwise it is a maximum, worst-case size of the - * compressed data. - */ - static native long lz4CompressGetOutputSize( - long inPtr, - long inSize, - int inputType, - long chunkSize, - long tempPtr, - long tempSize, - boolean computeExactSize); - - /** - * Perform LZ4 compression synchronously using the specified CUDA - * stream. + * Perform LZ4 compression asynchronously using the specified CUDA stream. + * @param compressedSizeOutputPtr host address of a 64-bit integer to update + * with the resulting compressed size of the + * data. For the operation to be truly + * asynchronous this should point to pinned + * host memory. 
* @param inPtr device address of the uncompressed data * @param inSize size of the uncompressed data in bytes * @param inputType type of uncompressed data @@ -144,9 +65,9 @@ static native long lz4CompressGetOutputSize( * @param outPtr device address of the output buffer * @param outSize size of the output buffer in bytes * @param stream CUDA stream to use - * @return size of the compressed output in bytes */ - static native long lz4Compress( + static native void lz4CompressAsync( + long compressedSizeOutputPtr, long inPtr, long inSize, int inputType, @@ -158,29 +79,33 @@ static native long lz4Compress( long stream); /** - * Perform LZ4 compression synchronously using the specified CUDA - * stream. - * @param compressedSizeOutputPtr host address of a 64-bit integer to update - * with the resulting compressed size of the - * data. For the operation to be truly - * asynchronous this should point to pinned - * host memory. + * Return the decompression configuration for a compressed input. + * NOTE: The resulting configuration object must be closed to destroy the corresponding + * host-side metadata created by this method to avoid a native memory leak. + * @param inPtr device address of the compressed data + * @param inSize size of the compressed data + * @return array of four longs containing metadata address, metadata size, temp storage size, + * and output buffer size + */ + static native long[] lz4DecompressConfigure(long inPtr, long inSize, long stream); + + /** + * Perform LZ4 decompression asynchronously using the specified CUDA stream. * @param inPtr device address of the uncompressed data * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param chunkSize size of an LZ4 chunk in bytes + * @param metadataPtr host address of the metadata + * @param metadataSize size of the metadata in bytes * @param tempPtr device address of the temporary compression storage buffer * @param tempSize size of the temporary storage buffer in bytes * @param outPtr device address of the output buffer * @param outSize size of the output buffer in bytes * @param stream CUDA stream to use */ - static native void lz4CompressAsync( - long compressedSizeOutputPtr, + static native void lz4DecompressAsync( long inPtr, long inSize, - int inputType, - long chunkSize, + long metadataPtr, + long metadataSize, long tempPtr, long tempSize, long outPtr, @@ -188,229 +113,99 @@ static native void lz4CompressAsync( long stream); /** - * Extracts the metadata from the batch of inputs on the device and - * copies them to the host. This synchronizes on the stream. - * @param inPtrs device addresses of the compressed buffers in the batch - * @param inSizes corresponding byte sizes of the compressed buffers - * @param stream CUDA stream to use - * @return handle to the batched decompress metadata on the host - */ - static native long batchedLZ4DecompressGetMetadata( - long[] inPtrs, - long[] inSizes, - long stream); - - /** - * Destroys batched metadata and frees the underlying host memory. - * @param batchedMetadata handle to the batched decompress metadata + * Destroy host-side metadata created by {@link NvcompJni#lz4DecompressConfigure(long, long, long)} + * @param metadataPtr host address of metadata */ - static native void batchedLZ4DecompressDestroyMetadata(long batchedMetadata); + static native void lz4DestroyMetadata(long metadataPtr); /** - * Computes the temporary storage size in bytes needed to decompress - * the compressed batch. 
- * @param batchedMetadata handle to the batched metadata - * @return number of temporary storage bytes needed to decompress the batch - */ - static native long batchedLZ4DecompressGetTempSize(long batchedMetadata); - - /** - * Computes the decompressed size of each chunk in the batch. - * @param batchedMetadata handle to the batched metadata - * @param numOutputs number of output buffers in the batch - * @return Array of corresponding output sizes needed to decompress - * each buffer in the batch. - */ - static native long[] batchedLZ4DecompressGetOutputSize( - long batchedMetadata, - long numOutputs); - - /** - * Asynchronously decompress a batch of compressed data buffers. - * @param inPtrs device addresses of the compressed buffers - * @param inSizes corresponding byte sizes of the compressed buffers - * @param tempPtr device address of the temporary decompression space - * @param tempSize size of the temporary decompression space in bytes - * @param batchedMetadata handle to the batched metadata - * @param outPtrs device addresses of the uncompressed output buffers - * @param outSizes corresponding byte sizes of the uncompressed output buffers - * @param stream CUDA stream to use - */ - static native void batchedLZ4DecompressAsync( - long[] inPtrs, - long[] inSizes, - long tempPtr, - long tempSize, - long batchedMetadata, - long[] outPtrs, - long[] outSizes, - long stream); - - /** - * Get the temporary workspace size required to perform compression of entire batch. - * @param inPtrs device addresses of the uncompressed buffers - * @param inSizes corresponding byte sizes of the uncompressed buffers - * @param chunkSize size of an LZ4 chunk in bytes + * Get the temporary workspace size required to perform compression of entire LZ4 batch. + * @param batchSize number of chunks in the batch + * @param maxChunkSize maximum size of an uncompressed chunk in bytes * @return The size of required temporary workspace in bytes to compress the batch. */ - static native long batchedLZ4CompressGetTempSize( - long[] inPtrs, - long[] inSizes, - long chunkSize); + static native long batchedLZ4CompressGetTempSize(long batchSize, long maxChunkSize); /** - * Get the required output sizes of each chunk to perform compression. - * @param inPtrs device addresses of the uncompressed buffers - * @param inSizes corresponding byte sizes of the uncompressed buffers - * @param chunkSize size of an LZ4 chunk in bytes - * @param tempPtr device address of the temporary workspace buffer - * @param tempSize size of the temporary workspace buffer in bytes - * @return array of corresponding sizes in bytes of the output buffers needed - * to compress the buffers in the batch. + * Get the maximum size any chunk could compress to in a LZ4 batch. This is the minimum amount of + * output memory to allocate per chunk when batch compressing. + * @param maxChunkSize maximum size of an uncompressed chunk size in bytes + * @return maximum compressed output size of a chunk */ - static native long[] batchedLZ4CompressGetOutputSize( - long[] inPtrs, - long[] inSizes, - long chunkSize, - long tempPtr, - long tempSize); + static native long batchedLZ4CompressGetMaxOutputChunkSize(long maxChunkSize); /** - * Asynchronously compress a batch of buffers. Note that + * Asynchronously compress a batch of buffers with LZ4. Note that * compressedSizesOutPtr must point to pinned memory for this operation * to be asynchronous. 
- * @param compressedSizesOutPtr host address where to write the sizes of the + * @param devInPtrs device address of uncompressed buffer addresses vector + * @param devInSizes device address of uncompressed buffer sizes vector + * @param chunkSize maximum size of an uncompressed chunk in bytes + * @param batchSize number of chunks in the batch + * @param tempPtr device address of the temporary workspace buffer + * @param tempSize size of the temporary workspace buffer in bytes + * @param devOutPtrs device address of output buffer addresses vector + * @param compressedSizesOutPtr device address where to write the sizes of the * compressed data written to the corresponding * output buffers. Must point to a buffer with * at least 8 bytes of memory per output buffer - * in the batch. For asynchronous operation - * this must point to pinned host memory. - * @param inPtrs device addresses of the uncompressed buffers - * @param inSizes corresponding byte sizes of the uncompressed buffers - * @param chunkSize size of an LZ4 chunk in bytes - * @param tempPtr device address of the temporary workspace buffer - * @param tempSize size of the temporary workspace buffer in bytes - * @param outPtrs device addresses of the output compressed buffers - * @param outSizes corresponding sizes in bytes of the output buffers + * in the batch. * @param stream CUDA stream to use */ static native void batchedLZ4CompressAsync( - long compressedSizesOutPtr, - long[] inPtrs, - long[] inSizes, + long devInPtrs, + long devInSizes, long chunkSize, + long batchSize, long tempPtr, long tempSize, - long[] outPtrs, - long[] outSizes, + long devOutPtrs, + long compressedSizesOutPtr, long stream); /** - * Calculate the temporary buffer size needed for cascaded compression. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param numRLEs number of Run Length Encoding layers to use - * @param numDeltas number of delta layers to use - * @param useBitPacking set to true if bit-packing should be used - * @return number of temporary storage bytes needed to compress - */ - static native long cascadedCompressGetTempSize( - long inPtr, - long inSize, - int inputType, - int numRLEs, - int numDeltas, - boolean useBitPacking); - - /** - * Calculate the output buffer size for cascaded compression. The output - * size can be an estimated upper bound or the exact value. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param numRLEs number of Run Length Encoding layers to use - * @param numDeltas number of delta layers to use - * @param useBitPacking set to true if bit-packing should be used - * @param tempPtr device address of the temporary compression storage buffer - * @param tempSize size of the temporary storage buffer in bytes - * @param computeExactSize set to true to compute the exact output size - * @return output buffer size in bytes. If computeExactSize is true then - * this is an exact size otherwise it is a maximum, worst-case size of the - * compressed data. + * Computes the temporary storage size in bytes needed to decompress a LZ4-compressed batch. 
+ * @param numChunks number of chunks in the batch + * @param maxUncompressedChunkBytes maximum uncompressed size of any chunk in bytes + * @return number of temporary storage bytes needed to decompress the batch */ - static native long cascadedCompressGetOutputSize( - long inPtr, - long inSize, - int inputType, - int numRLEs, - int numDeltas, - boolean useBitPacking, - long tempPtr, - long tempSize, - boolean computeExactSize); + static native long batchedLZ4DecompressGetTempSize( + long numChunks, + long maxUncompressedChunkBytes); /** - * Perform cascaded compression synchronously using the specified CUDA - * stream. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param numRLEs number of Run Length Encoding layers to use - * @param numDeltas number of delta layers to use - * @param useBitPacking set to true if bit-packing should be used - * @param tempPtr device address of the temporary compression storage buffer - * @param tempSize size of the temporary storage buffer in bytes - * @param outPtr device address of the output buffer - * @param outSize size of the output buffer in bytes + * Asynchronously decompress a batch of LZ4-compressed data buffers. + * @param devInPtrs device address of compressed input buffer addresses vector + * @param devInSizes device address of compressed input buffer sizes vector + * @param devOutSizes device address of uncompressed buffer sizes vector + * @param batchSize number of buffers in the batch + * @param tempPtr device address of the temporary decompression space + * @param tempSize size of the temporary decompression space in bytes + * @param devOutPtrs device address of uncompressed output buffer addresses vector * @param stream CUDA stream to use - * @return size of the compressed output in bytes */ - static native long cascadedCompress( - long inPtr, - long inSize, - int inputType, - int numRLEs, - int numDeltas, - boolean useBitPacking, + static native void batchedLZ4DecompressAsync( + long devInPtrs, + long devInSizes, + long devOutSizes, + long batchSize, long tempPtr, long tempSize, - long outPtr, - long outSize, + long devOutPtrs, long stream); /** - * Perform cascaded compression asynchronously using the specified CUDA - * stream. Note if the compressedSizeOutputPtr argument points to paged - * memory then this may synchronize in practice due to limitations of - * copying from the device to paged memory. - * @param compressedSizeOutputPtr address of a 64-bit integer to update with - * the resulting compressed size of the data. - * For the operation to be truly asynchronous - * this should point to pinned host memory. - * @param inPtr device address of the uncompressed data - * @param inSize size of the uncompressed data in bytes - * @param inputType type of uncompressed data - * @param numRLEs number of Run Length Encoding layers to use - * @param numDeltas number of delta layers to use - * @param useBitPacking set to true if bit-packing should be used - * @param tempPtr device address of the temporary compression storage buffer - * @param tempSize size of the temporary storage buffer in bytes - * @param outPtr device address of the output buffer - * @param outSize size of the output buffer in bytes + * Asynchronously calculates the decompressed size needed for each chunk. 
+ * @param devInPtrs device address of compressed input buffer addresses vector + * @param devInSizes device address of compressed input buffer sizes vector + * @param devOutSizes device address of calculated decompress sizes vector + * @param batchSize number of buffers in the batch * @param stream CUDA stream to use */ - static native void cascadedCompressAsync( - long compressedSizeOutputPtr, - long inPtr, - long inSize, - int inputType, - int numRLEs, - int numDeltas, - boolean useBitPacking, - long tempPtr, - long tempSize, - long outPtr, - long outSize, + static native void batchedLZ4GetDecompressSizeAsync( + long devInPtrs, + long devInSizes, + long devOutSizes, + long batchSize, long stream); } diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 2c95c6eebac..f8840033733 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -1,22 +1,21 @@ -#============================================================================= +# ============================================================================= # Copyright (c) 2019-2021, NVIDIA CORPORATION. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
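Taken together, the reworked NvcompJni batched entry points trade the old long[] address arrays for device-resident pointer and size vectors. A rough sketch of the compression-side call order, assuming the code sits in the ai.rapids.cudf.nvcomp package (the natives are package-private) and that the caller has already packed one long per chunk into the devInPtrs/devInSizes/devOutPtrs buffers, much as BatchedLZ4Compressor does above:

```java
package ai.rapids.cudf.nvcomp;

import ai.rapids.cudf.Cuda;
import ai.rapids.cudf.DeviceMemoryBuffer;

class BatchedCompressCallOrder {
  // devInPtrs/devInSizes/devOutPtrs: one long per chunk, already populated on the device.
  // devCompressedSizes: receives one long per chunk with the resulting compressed sizes.
  static void compressBatch(long batchSize, long chunkSize,
                            DeviceMemoryBuffer devInPtrs,
                            DeviceMemoryBuffer devInSizes,
                            DeviceMemoryBuffer devOutPtrs,
                            DeviceMemoryBuffer devCompressedSizes,
                            Cuda.Stream stream) {
    // 1. Every output chunk must be allocated at least this large; the caller is assumed
    //    to have sized the buffers behind devOutPtrs from this value.
    long maxOutChunk = NvcompJni.batchedLZ4CompressGetMaxOutputChunkSize(chunkSize);
    assert maxOutChunk > 0;

    // 2. Scratch space for the whole batch.
    long tempSize = NvcompJni.batchedLZ4CompressGetTempSize(batchSize, chunkSize);
    try (DeviceMemoryBuffer temp = DeviceMemoryBuffer.allocate(tempSize)) {
      // 3. Launch the batched compression; per-chunk compressed sizes land in devCompressedSizes.
      NvcompJni.batchedLZ4CompressAsync(
          devInPtrs.getAddress(), devInSizes.getAddress(),
          chunkSize, batchSize,
          temp.getAddress(), temp.getLength(),
          devOutPtrs.getAddress(), devCompressedSizes.getAddress(),
          stream.getStream());
    }
  }
}
```

The decompression side follows the same shape: batchedLZ4GetDecompressSizeAsync() when the uncompressed chunk sizes are not already known, batchedLZ4DecompressGetTempSize() for scratch space, then batchedLZ4DecompressAsync().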
+# ============================================================================= cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-21.12/RAPIDS.cmake - ${CMAKE_BINARY_DIR}/RAPIDS.cmake) + ${CMAKE_BINARY_DIR}/RAPIDS.cmake +) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) include(rapids-cmake) @@ -29,10 +28,14 @@ if(DEFINED GPU_ARCHS) endif() rapids_cuda_init_architectures(CUDF_JNI) -project(CUDF_JNI VERSION 21.12.00 LANGUAGES C CXX CUDA) +project( + CUDF_JNI + VERSION 21.12.00 + LANGUAGES C CXX CUDA +) -################################################################################################### -# - build options --------------------------------------------------------------------------------- +# ################################################################################################## +# * build options --------------------------------------------------------------------------------- option(USE_NVTX "Build with NVTX support" ON) option(BUILD_TESTS "Configure CMake to build tests" ON) @@ -49,19 +52,18 @@ message(VERBOSE "CUDF_JNI: Build with GPUDirect Storage support: ${USE_GDS}") message(VERBOSE "CUDF_JNI: Build with static Arrow library: ${CUDF_JNI_ARROW_STATIC}") set(CUDF_SOURCE_DIR "${PROJECT_SOURCE_DIR}/../../../../cpp") -if (DEFINED ENV{CUDF_CPP_BUILD_DIR}) +if(DEFINED ENV{CUDF_CPP_BUILD_DIR}) set(CUDF_CPP_BUILD_DIR "$ENV{CUDF_CPP_BUILD_DIR}") else() set(CUDF_CPP_BUILD_DIR "${CUDF_SOURCE_DIR}/build") endif() -set(CMAKE_MODULE_PATH - "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/" - "${CUDF_SOURCE_DIR}/cmake/Modules/" - ${CMAKE_MODULE_PATH}) +set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/" + "${CUDF_SOURCE_DIR}/cmake/Modules/" ${CMAKE_MODULE_PATH} +) -################################################################################################### -# - compiler options ------------------------------------------------------------------------------ +# ################################################################################################## +# * compiler options ------------------------------------------------------------------------------ set(CUDF_CXX_FLAGS "") set(CUDF_CUDA_FLAGS "") @@ -73,80 +75,80 @@ include(ConfigureCUDA) # set other CUDA compilation flags # Disable NVTX if necessary if(NOT USE_NVTX) - target_compile_definitions(cudfjni PUBLIC NVTX_DISABLE) + target_compile_definitions(cudfjni PUBLIC NVTX_DISABLE) endif() if(PER_THREAD_DEFAULT_STREAM) - message(STATUS "Using per-thread default stream") - add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM) + message(STATUS "Using per-thread default stream") + add_compile_definitions(CUDA_API_PER_THREAD_DEFAULT_STREAM) endif() -################################################################################################### -# - build type ------------------------------------------------------------------------------------ -# Set a default build type if none was specified +# ################################################################################################## +# * build type ------------------------------------------------------------------------------------ +# Set a default build type if none was specified rapids_cmake_build_type("Release") -################################################################################################### -# - Thrust/CUB/libcudacxx ------------------------------------------------------------------------------------ -find_path(THRUST_INCLUDE "thrust" - HINTS 
"$ENV{CUDF_ROOT}/_deps/thrust-src" - "${CUDF_CPP_BUILD_DIR}/_deps/thrust-src" - "$ENV{CONDA_PREFIX}/include") +# ################################################################################################## +# * Thrust/CUB/libcudacxx +# ------------------------------------------------------------------------------------ +find_path( + THRUST_INCLUDE "thrust" + HINTS "$ENV{CUDF_ROOT}/_deps/thrust-src" "${CUDF_CPP_BUILD_DIR}/_deps/thrust-src" + "$ENV{CONDA_PREFIX}/include" +) message(STATUS "THRUST: THRUST_INCLUDE set to ${THRUST_INCLUDE}") -find_path(CUB_INCLUDE "cub" - HINTS "$ENV{CUDF_ROOT}/_deps/thrust-src" - "${CUDF_CPP_BUILD_DIR}/_deps/thrust-src" - "$ENV{CONDA_PREFIX}/include") +find_path( + CUB_INCLUDE "cub" HINTS "$ENV{CUDF_ROOT}/_deps/thrust-src" + "${CUDF_CPP_BUILD_DIR}/_deps/thrust-src" "$ENV{CONDA_PREFIX}/include" +) message(STATUS "CUB: CUB_INCLUDE set to ${CUB_INCLUDE}") -find_path(LIBCUDACXX_INCLUDE "cuda" - HINTS "$ENV{CUDF_ROOT}/_deps/libcudacxx-src/include" - "${CUDF_CPP_BUILD_DIR}/_deps/libcudacxx-src/include") +find_path(LIBCUDACXX_INCLUDE "cuda" HINTS "$ENV{CUDF_ROOT}/_deps/libcudacxx-src/include" + "${CUDF_CPP_BUILD_DIR}/_deps/libcudacxx-src/include" +) message(STATUS "LIBCUDACXX: LIBCUDACXX_INCLUDE set to ${LIBCUDACXX_INCLUDE}") -find_path(SPDLOG_INCLUDE "spdlog" - HINTS "${CUDF_CPP_BUILD_DIR}/_deps/spdlog-src/include" - "$ENV{RMM_ROOT}/_deps/spdlog-src/include" - "$ENV{RMM_ROOT}/include" - "$ENV{CONDA_PREFIX}/include") +find_path( + SPDLOG_INCLUDE "spdlog" + HINTS "${CUDF_CPP_BUILD_DIR}/_deps/spdlog-src/include" "$ENV{RMM_ROOT}/_deps/spdlog-src/include" + "$ENV{RMM_ROOT}/include" "$ENV{CONDA_PREFIX}/include" +) message(STATUS "SPDLOG: SPDLOG_INCLUDE set to ${SPDLOG_INCLUDE}") -################################################################################################### -# - CUDF ------------------------------------------------------------------------------------------ +# ################################################################################################## +# * CUDF ------------------------------------------------------------------------------------------ set(CUDF_INCLUDE "${PROJECT_SOURCE_DIR}/../../../../cpp/include" - "${PROJECT_SOURCE_DIR}/../../../../cpp/src/") + "${PROJECT_SOURCE_DIR}/../../../../cpp/src/" +) -set(CUDF_LIB_HINTS - HINTS "$ENV{CUDF_ROOT}" - "$ENV{CUDF_ROOT}/lib" - "$ENV{CONDA_PREFIX}/lib" - "${CUDF_CPP_BUILD_DIR}") +set(CUDF_LIB_HINTS HINTS "$ENV{CUDF_ROOT}" "$ENV{CUDF_ROOT}/lib" "$ENV{CONDA_PREFIX}/lib" + "${CUDF_CPP_BUILD_DIR}" +) find_library(CUDF_LIB "cudf" REQUIRED HINTS ${CUDF_LIB_HINTS}) -################################################################################################### -# - RMM ------------------------------------------------------------------------------------------- +# ################################################################################################## +# * RMM ------------------------------------------------------------------------------------------- -find_path(RMM_INCLUDE "rmm" - HINTS "${CUDF_CPP_BUILD_DIR}/_deps/rmm-src/include" - "$ENV{RMM_ROOT}/include" - "$ENV{RMM_HOME}/include" - "$ENV{CONDA_PREFIX}/include/rmm" - "$ENV{CONDA_PREFIX}/include") +find_path( + RMM_INCLUDE "rmm" + HINTS "${CUDF_CPP_BUILD_DIR}/_deps/rmm-src/include" "$ENV{RMM_ROOT}/include" + "$ENV{RMM_HOME}/include" "$ENV{CONDA_PREFIX}/include/rmm" "$ENV{CONDA_PREFIX}/include" +) message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}") 
-################################################################################################### -# - ARROW ----------------------------------------------------------------------------------------- +# ################################################################################################## +# * ARROW ----------------------------------------------------------------------------------------- -find_path(ARROW_INCLUDE "arrow" - HINTS "$ENV{ARROW_ROOT}/include" - "${CUDF_CPP_BUILD_DIR}/_deps/arrow-src/cpp/src") +find_path(ARROW_INCLUDE "arrow" HINTS "$ENV{ARROW_ROOT}/include" + "${CUDF_CPP_BUILD_DIR}/_deps/arrow-src/cpp/src" +) message(STATUS "ARROW: ARROW_INCLUDE set to ${ARROW_INCLUDE}") @@ -157,14 +159,17 @@ else() set(CUDF_JNI_ARROW_LIBNAME "arrow") endif() -find_library(ARROW_LIBRARY ${CUDF_JNI_ARROW_LIBNAME} REQUIRED - HINTS "$ENV{ARROW_ROOT}/lib" - "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/release" - "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/debug") +find_library( + ARROW_LIBRARY ${CUDF_JNI_ARROW_LIBNAME} REQUIRED + HINTS "$ENV{ARROW_ROOT}/lib" "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/release" + "${CUDF_CPP_BUILD_DIR}/_deps/arrow-build/debug" +) if(NOT ARROW_LIBRARY) if(CUDF_JNI_ARROW_STATIC) - message(FATAL_ERROR "Arrow static library not found. Was libcudf built with CUDF_USE_ARROW_STATIC=ON?") + message( + FATAL_ERROR "Arrow static library not found. Was libcudf built with CUDF_USE_ARROW_STATIC=ON?" + ) else() message(FATAL_ERROR "Arrow dynamic library not found.") endif() @@ -172,137 +177,154 @@ else() message(STATUS "ARROW: ARROW_LIBRARY set to ${ARROW_LIBRARY}") endif() -################################################################################################### -# - find JNI ------------------------------------------------------------------------------------- +# ################################################################################################## +# * find JNI ------------------------------------------------------------------------------------- find_package(JNI REQUIRED) if(JNI_FOUND) - message(STATUS "JDK with JNI in ${JNI_INCLUDE_DIRS}") + message(STATUS "JDK with JNI in ${JNI_INCLUDE_DIRS}") else() - message(FATAL_ERROR "JDK with JNI not found, please check your settings.") + message(FATAL_ERROR "JDK with JNI not found, please check your settings.") endif() -################################################################################################### -# - nvcomp ---------------------------------------------------------------------------------------- +# ################################################################################################## +# * nvcomp ---------------------------------------------------------------------------------------- -include(ConfigureNvcomp) -if(NVCOMP_FOUND) - message(STATUS "nvcomp compression library found in ${NVCOMP_ROOT}") +find_path(NVCOMP_INCLUDE "nvcomp" HINTS "${CUDF_CPP_BUILD_DIR}/_deps/nvcomp-src/include" + "$ENV{CONDA_PREFIX}/include" +) + +message(STATUS "NVCOMP: NVCOMP_INCLUDE set to ${NVCOMP_INCLUDE}") + +set(CUDF_JNI_NVCOMP_LIBNAME "libnvcomp.a") +find_library( + NVCOMP_LIBRARY ${CUDF_JNI_NVCOMP_LIBNAME} REQUIRED HINTS "${CUDF_CPP_BUILD_DIR}/lib" + "$ENV{CONDA_PREFIX}/lib" +) + +if(NOT NVCOMP_LIBRARY) + message(FATAL_ERROR "nvcomp static library not found.") else() - message(FATAL_ERROR "nvcomp compression library not found.") + message(STATUS "NVCOMP: NVCOMP_LIBRARY set to ${NVCOMP_LIBRARY}") endif() 
-################################################################################################### -# - GDS/cufile ------------------------------------------------------------------------------------ +# ################################################################################################## +# * GDS/cufile ------------------------------------------------------------------------------------ if(USE_GDS) - message(STATUS "Building with GPUDirect Storage (GDS)/cuFile support") - find_package(cuFile REQUIRED) + message(STATUS "Building with GPUDirect Storage (GDS)/cuFile support") + find_package(cuFile REQUIRED) endif() -################################################################################################### -# - library targets ------------------------------------------------------------------------------- - -add_library(cudfjni SHARED - src/row_conversion.cu - src/AggregationJni.cpp - src/CudfJni.cpp - src/CudaJni.cpp - src/ColumnVectorJni.cpp - src/ColumnViewJni.cpp - src/CompiledExpression.cpp - src/ContiguousTableJni.cpp - src/HashJoinJni.cpp - src/HostMemoryBufferNativeUtilsJni.cpp - src/NvcompJni.cpp - src/NvtxRangeJni.cpp - src/RmmJni.cpp - src/ScalarJni.cpp - src/TableJni.cpp - src/map_lookup.cu) - -################################################################################################### -# - include paths --------------------------------------------------------------------------------- - -target_include_directories(cudfjni - PUBLIC - "${THRUST_INCLUDE}" - "${CUB_INCLUDE}" - "${LIBCUDACXX_INCLUDE}" - "${CUDAToolkit_INCLUDE_DIRS}" - "${NVCOMP_INCLUDE_DIR}" - "${CMAKE_BINARY_DIR}/include" - "${CMAKE_SOURCE_DIR}/include" - "${SPDLOG_INCLUDE}" - "${CMAKE_SOURCE_DIR}/src" - "${JNI_INCLUDE_DIRS}" - "${CUDF_INCLUDE}" - "${RMM_INCLUDE}" - "${ARROW_INCLUDE}") - -################################################################################################### -# - compile options --------------------------------------------------------------------------------- - -#Override RPATH for cudfjni -set_target_properties(cudfjni - PROPERTIES BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - # set target compile options - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON +# ################################################################################################## +# * library targets ------------------------------------------------------------------------------- + +add_library( + cudfjni SHARED + src/row_conversion.cu + src/AggregationJni.cpp + src/CudfJni.cpp + src/CudaJni.cpp + src/ColumnVectorJni.cpp + src/ColumnViewJni.cpp + src/CompiledExpression.cpp + src/ContiguousTableJni.cpp + src/HashJoinJni.cpp + src/HostMemoryBufferNativeUtilsJni.cpp + src/NvcompJni.cpp + src/NvtxRangeJni.cpp + src/NvtxUniqueRangeJni.cpp + src/RmmJni.cpp + src/ScalarJni.cpp + src/TableJni.cpp + src/map_lookup.cu + src/check_nvcomp_output_sizes.cu ) -target_compile_options(cudfjni - PRIVATE "$<$:${CUDF_CXX_FLAGS}>" - "$<$:${CUDF_CUDA_FLAGS}>" +# ################################################################################################## +# * include paths --------------------------------------------------------------------------------- + +target_include_directories( + cudfjni + PUBLIC "${THRUST_INCLUDE}" + "${CUB_INCLUDE}" + "${LIBCUDACXX_INCLUDE}" + "${CUDAToolkit_INCLUDE_DIRS}" + "${NVCOMP_INCLUDE}" + "${CMAKE_BINARY_DIR}/include" + "${CMAKE_SOURCE_DIR}/include" + "${SPDLOG_INCLUDE}" + "${CMAKE_SOURCE_DIR}/src" + "${JNI_INCLUDE_DIRS}" 
+ "${CUDF_INCLUDE}" + "${RMM_INCLUDE}" + "${ARROW_INCLUDE}" ) -target_compile_definitions(cudfjni - PUBLIC "$<$:${CUDF_CXX_DEFINITIONS}>" - "$<$:${CUDF_CUDA_DEFINITIONS}>" +# ################################################################################################## +# * compile options +# --------------------------------------------------------------------------------- + +# Override RPATH for cudfjni +set_target_properties( + cudfjni + PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" # set target compile options + CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON +) + +target_compile_options( + cudfjni PRIVATE "$<$:${CUDF_CXX_FLAGS}>" + "$<$:${CUDF_CUDA_FLAGS}>" +) + +target_compile_definitions( + cudfjni PUBLIC "$<$:${CUDF_CXX_DEFINITIONS}>" + "$<$:${CUDF_CUDA_DEFINITIONS}>" ) if(USE_GDS) - add_library(cufilejni SHARED src/CuFileJni.cpp) - set_target_properties(cufilejni - PROPERTIES BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - # set target compile options - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - ) - target_include_directories(cufilejni - PUBLIC "${LIBCUDACXX_INCLUDE}" - "${CUDF_INCLUDE}" - PRIVATE "${cuFile_INCLUDE_DIRS}") - target_link_libraries(cufilejni PRIVATE cudfjni "${cuFile_LIBRARIES}") + add_library(cufilejni SHARED src/CuFileJni.cpp) + set_target_properties( + cufilejni + PROPERTIES BUILD_RPATH "\$ORIGIN" INSTALL_RPATH "\$ORIGIN" # set target compile options + CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON + ) + target_include_directories( + cufilejni + PUBLIC "${LIBCUDACXX_INCLUDE}" "${CUDF_INCLUDE}" + PRIVATE "${cuFile_INCLUDE_DIRS}" + ) + target_link_libraries(cufilejni PRIVATE cudfjni "${cuFile_LIBRARIES}") endif() -################################################################################################### -# - rmm logging level ----------------------------------------------------------------------------- +# ################################################################################################## +# * rmm logging level ----------------------------------------------------------------------------- -set(RMM_LOGGING_LEVEL "INFO" CACHE STRING "Choose the logging level.") +set(RMM_LOGGING_LEVEL + "INFO" + CACHE STRING "Choose the logging level." +) # Set the possible values of build type for cmake-gui -set_property(CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS - "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF") +set_property( + CACHE RMM_LOGGING_LEVEL PROPERTY STRINGS "TRACE" "DEBUG" "INFO" "WARN" "ERROR" "CRITICAL" "OFF" +) message(STATUS "RMM_LOGGING_LEVEL = '${RMM_LOGGING_LEVEL}'.") target_compile_definitions(cudfjni PUBLIC SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${RMM_LOGGING_LEVEL}) -################################################################################################### -# - link libraries -------------------------------------------------------------------------------- +# ################################################################################################## +# * link libraries -------------------------------------------------------------------------------- -target_link_libraries(cudfjni PRIVATE nvcomp ${CUDF_LIB} ${ARROW_LIBRARY}) +target_link_libraries(cudfjni PRIVATE ${NVCOMP_LIBRARY} ${CUDF_LIB} ${ARROW_LIBRARY}) -################################################################################################### -# - cudart options -------------------------------------------------------------------------------- -# cudart can be statically linked or dynamically linked. 
The python ecosystem wants dynamic linking +# ################################################################################################## +# * cudart options -------------------------------------------------------------------------------- +# cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic +# linking if(CUDA_STATIC_RUNTIME) - # Tell CMake what CUDA language runtime to use - set_target_properties(cudfjni PROPERTIES CUDA_RUNTIME_LIBRARY Static) + # Tell CMake what CUDA language runtime to use + set_target_properties(cudfjni PROPERTIES CUDA_RUNTIME_LIBRARY Static) else() - # Tell CMake what CUDA language runtime to use - set_target_properties(cudfjni PROPERTIES CUDA_RUNTIME_LIBRARY Shared) + # Tell CMake what CUDA language runtime to use + set_target_properties(cudfjni PROPERTIES CUDA_RUNTIME_LIBRARY Shared) endif() diff --git a/java/src/main/native/cmake/Modules/ConfigureNvcomp.cmake b/java/src/main/native/cmake/Modules/ConfigureNvcomp.cmake deleted file mode 100644 index 1a0083e4518..00000000000 --- a/java/src/main/native/cmake/Modules/ConfigureNvcomp.cmake +++ /dev/null @@ -1,79 +0,0 @@ -#============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -set(NVCOMP_ROOT "${CMAKE_BINARY_DIR}/nvcomp") - -if(CUDA_STATIC_RUNTIME) - set(NVCOMP_CUDA_RUNTIME_LIBRARY Static) -else() - set(NVCOMP_CUDA_RUNTIME_LIBRARY Shared) -endif() - -set(NVCOMP_CMAKE_ARGS "-DCMAKE_CUDA_RUNTIME_LIBRARY=${NVCOMP_CUDA_RUNTIME_LIBRARY} -DUSE_RMM=ON -DCUB_DIR=${CUB_INCLUDE}") - -configure_file("${CMAKE_SOURCE_DIR}/cmake/Templates/Nvcomp.CMakeLists.txt.cmake" - "${NVCOMP_ROOT}/CMakeLists.txt") - -file(MAKE_DIRECTORY "${NVCOMP_ROOT}/build") - -execute_process(COMMAND ${CMAKE_COMMAND} -G ${CMAKE_GENERATOR} . - RESULT_VARIABLE NVCOMP_CONFIG - WORKING_DIRECTORY ${NVCOMP_ROOT}) - -if(NVCOMP_CONFIG) - message(FATAL_ERROR "Configuring nvcomp failed: " ${NVCOMP_CONFIG}) -endif() - -set(PARALLEL_BUILD -j) -if($ENV{PARALLEL_LEVEL}) - set(NUM_JOBS $ENV{PARALLEL_LEVEL}) - set(PARALLEL_BUILD "${PARALLEL_BUILD}${NUM_JOBS}") -endif() - -if(${NUM_JOBS}) - if(${NUM_JOBS} EQUAL 1) - message(STATUS "NVCOMP BUILD: Enabling Sequential CMake build") - elseif(${NUM_JOBS} GREATER 1) - message(STATUS "NVCOMP BUILD: Enabling Parallel CMake build with ${NUM_JOBS} jobs") - endif() -else() - message(STATUS "NVCOMP BUILD: Enabling Parallel CMake build with all threads") -endif() - -execute_process(COMMAND ${CMAKE_COMMAND} --build .. 
-- ${PARALLEL_BUILD} - RESULT_VARIABLE NVCOMP_BUILD - WORKING_DIRECTORY ${NVCOMP_ROOT}/build) - -if(NVCOMP_BUILD) - message(FATAL_ERROR "Building nvcomp failed: " ${NVCOMP_BUILD}) -endif() - -message(STATUS "nvcomp build completed at: " ${NVCOMP_ROOT}/build) - -set(NVCOMP_INCLUDE_DIR "${NVCOMP_ROOT}/build/include") -set(NVCOMP_LIBRARY_DIR "${NVCOMP_ROOT}/build/lib") - -find_library(NVCOMP_LIB nvcomp - NO_DEFAULT_PATH - HINTS "${NVCOMP_LIBRARY_DIR}") - -if(NVCOMP_LIB) - message(STATUS "nvcomp library: " ${NVCOMP_LIB}) - set(NVCOMP_FOUND TRUE) - - add_library(nvcomp STATIC IMPORTED) - set_target_properties(nvcomp PROPERTIES IMPORTED_LOCATION "${NVCOMP_LIB}") -endif() diff --git a/java/src/main/native/cmake/Templates/Nvcomp.CMakeLists.txt.cmake b/java/src/main/native/cmake/Templates/Nvcomp.CMakeLists.txt.cmake deleted file mode 100644 index 5761ef8fd1b..00000000000 --- a/java/src/main/native/cmake/Templates/Nvcomp.CMakeLists.txt.cmake +++ /dev/null @@ -1,33 +0,0 @@ -#============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#============================================================================= - -cmake_minimum_required(VERSION 3.12) - -project(nvcomp) - -include(ExternalProject) - -ExternalProject_Add(nvcomp - GIT_REPOSITORY https://github.com/NVIDIA/nvcomp.git - GIT_TAG v1.2.1 - GIT_SHALLOW true - SOURCE_DIR "${NVCOMP_ROOT}/nvcomp" - BINARY_DIR "${NVCOMP_ROOT}/build" - INSTALL_DIR "${NVCOMP_ROOT}/install" - PATCH_COMMAND patch --reject-file=- -p1 -N < ${CMAKE_CURRENT_SOURCE_DIR}/cmake/nvcomp.patch || true - CMAKE_ARGS ${NVCOMP_CMAKE_ARGS} -DCMAKE_INSTALL_PREFIX=${NVCOMP_ROOT}/install - BUILD_COMMAND ${CMAKE_COMMAND} --build . 
--target nvcomp - INSTALL_COMMAND ${CMAKE_COMMAND} -E echo "Skipping nvcomp install step.") diff --git a/java/src/main/native/cmake/nvcomp.patch b/java/src/main/native/cmake/nvcomp.patch deleted file mode 100644 index ea1340b7754..00000000000 --- a/java/src/main/native/cmake/nvcomp.patch +++ /dev/null @@ -1,15 +0,0 @@ -diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt -index 32f48ef..a2e3125 100644 ---- a/src/CMakeLists.txt -+++ b/src/CMakeLists.txt -@@ -10,7 +10,9 @@ endif() - file(GLOB CUDA_SOURCES *.cu) - file(GLOB CPP_SOURCES *.cpp) - --add_library(nvcomp SHARED ${CUDA_SOURCES} ${CPP_SOURCES}) -+ -+add_library(nvcomp STATIC ${CUDA_SOURCES} ${CPP_SOURCES}) -+set_property(TARGET nvcomp PROPERTY POSITION_INDEPENDENT_CODE True) - set_property(TARGET nvcomp PROPERTY CUDA_ARCHITECTURES ${GPU_ARCHS}) - target_compile_options(nvcomp PRIVATE - $<$:--expt-extended-lambda -Xcompiler -pthread>) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index 4b6696e3911..d7acaa679f6 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -21,6 +21,7 @@ #include #include +#include namespace cudf { namespace jni { @@ -327,7 +328,7 @@ template class native_jpointerArray { return data()[index]; } - T *const *data() const { return reinterpret_cast(wrapped.data()); } + T *const *data() const { return reinterpret_cast(wrapped.data()); } T **data() { return reinterpret_cast(wrapped.data()); } @@ -741,11 +742,7 @@ inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) { } #define CATCH_STD_CLASS(env, class_name, ret_val) \ - catch (const std::bad_alloc &e) { \ - /* In some cases a cuda exception can be the cause so peek and clear if needed*/ \ - if (cudaErrorMemoryAllocation == cudaPeekAtLastError()) { \ - cudaGetLastError(); \ - } \ + catch (const rmm::out_of_memory &e) { \ auto what = \ std::string("Could not allocate native memory: ") + (e.what() == nullptr ? 
"" : e.what()); \ JNI_CHECK_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val); \ diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index cd5ff073edd..f95b05d5aeb 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -61,7 +61,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -295,14 +297,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_approxPercentile(JNIEnv * JNI_NULL_CHECK(env, input_column, "input_column native handle is null", 0); JNI_NULL_CHECK(env, percentiles_column, "percentiles_column native handle is null", 0); try { - cudf::jni::auto_set_device(env); - cudf::column_view *n_input_column = reinterpret_cast(input_column); - std::unique_ptr input_view = - std::make_unique(*n_input_column); - cudf::column_view *n_percentiles_column = - reinterpret_cast(percentiles_column); - std::unique_ptr result = - cudf::percentile_approx(*input_view, *n_percentiles_column); + using namespace cudf; + using tdigest_column_view = cudf::tdigest::tdigest_column_view; + jni::auto_set_device(env); + auto const tdigest_view = + tdigest_column_view{structs_column_view{*reinterpret_cast(input_column)}}; + auto const p_percentiles = reinterpret_cast(percentiles_column); + auto result = percentile_approx(tdigest_view, *p_percentiles); return reinterpret_cast(result.release()); } CATCH_STD(env, 0); diff --git a/java/src/main/native/src/CompiledExpression.cpp b/java/src/main/native/src/CompiledExpression.cpp index 4b378905a43..a18c88e10dc 100644 --- a/java/src/main/native/src/CompiledExpression.cpp +++ b/java/src/main/native/src/CompiledExpression.cpp @@ -144,6 +144,9 @@ cudf::ast::ast_operator jni_to_unary_operator(jbyte jni_op_value) { case 20: return cudf::ast::ast_operator::RINT; case 21: return cudf::ast::ast_operator::BIT_INVERT; case 22: return cudf::ast::ast_operator::NOT; + case 23: return cudf::ast::ast_operator::CAST_TO_INT64; + case 24: return cudf::ast::ast_operator::CAST_TO_UINT64; + case 25: return cudf::ast::ast_operator::CAST_TO_FLOAT64; default: throw std::invalid_argument("unexpected JNI AST unary operator value"); } } diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp index 4f1239a8966..e548b4ce65c 100644 --- a/java/src/main/native/src/CudaJni.cpp +++ b/java/src/main/native/src/CudaJni.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include "jni_utils.hpp" @@ -348,4 +349,18 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemcpyOnStream(JNIEnv *env, CATCH_STD(env, ); } +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_profilerStart(JNIEnv *env, jclass clazz) { + try { + cudaProfilerStart(); + } + CATCH_STD(env, ); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_profilerStop(JNIEnv *env, jclass clazz) { + try { + cudaProfilerStop(); + } + CATCH_STD(env, ); +} + } // extern "C" diff --git a/java/src/main/native/src/NvcompJni.cpp b/java/src/main/native/src/NvcompJni.cpp index 0e34d2856f5..533654baee1 100644 --- a/java/src/main/native/src/NvcompJni.cpp +++ b/java/src/main/native/src/NvcompJni.cpp @@ -13,11 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - -#include -#include #include +#include +#include + +#include "check_nvcomp_output_sizes.hpp" #include "cudf_jni_apis.hpp" namespace { @@ -27,7 +28,7 @@ constexpr char const *NVCOMP_CUDA_ERROR_CLASS = "ai/rapids/cudf/nvcomp/NvcompCud constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException"; constexpr char const *UNSUPPORTED_CLASS = "java/lang/UnsupportedOperationException"; -void check_nvcomp_status(JNIEnv *env, nvcompError_t status) { +void check_nvcomp_status(JNIEnv *env, nvcompStatus_t status) { switch (status) { case nvcompSuccess: break; case nvcompErrorInvalidValue: @@ -36,9 +37,15 @@ void check_nvcomp_status(JNIEnv *env, nvcompError_t status) { case nvcompErrorNotSupported: cudf::jni::throw_java_exception(env, UNSUPPORTED_CLASS, "nvcomp unsupported"); break; + case nvcompErrorCannotDecompress: + cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "nvcomp cannot decompress"); + break; case nvcompErrorCudaError: cudf::jni::throw_java_exception(env, NVCOMP_CUDA_ERROR_CLASS, "nvcomp CUDA error"); break; + case nvcompErrorInternal: + cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "nvcomp internal error"); + break; default: cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "nvcomp unknown error"); break; @@ -49,74 +56,16 @@ void check_nvcomp_status(JNIEnv *env, nvcompError_t status) { extern "C" { -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressGetMetadata( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jlong jstream) { - try { - cudf::jni::auto_set_device(env); - void *metadata_ptr; - auto stream = reinterpret_cast(jstream); - auto status = nvcompDecompressGetMetadata(reinterpret_cast(in_ptr), in_size, - &metadata_ptr, stream); - check_nvcomp_status(env, status); - return reinterpret_cast(metadata_ptr); - } - CATCH_STD(env, 0); -} - -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressDestroyMetadata( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - nvcompDecompressDestroyMetadata(reinterpret_cast(metadata_ptr)); - } - CATCH_STD(env, ); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressGetTempSize( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - size_t temp_size; - auto status = nvcompDecompressGetTempSize(reinterpret_cast(metadata_ptr), &temp_size); - check_nvcomp_status(env, status); - return temp_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressGetOutputSize( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - size_t out_size; - auto status = nvcompDecompressGetOutputSize(reinterpret_cast(metadata_ptr), &out_size); - check_nvcomp_status(env, status); - return out_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_decompressAsync( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jlong temp_ptr, jlong temp_size, - jlong metadata_ptr, jlong out_ptr, jlong out_size, jlong jstream) { - try { - cudf::jni::auto_set_device(env); - auto stream = reinterpret_cast(jstream); - auto status = nvcompDecompressAsync(reinterpret_cast(in_ptr), in_size, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(metadata_ptr), - reinterpret_cast(out_ptr), out_size, stream); - check_nvcomp_status(env, status); - } - CATCH_STD(env, ); -} - JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_isLZ4Data(JNIEnv *env, jclass, - jlong in_ptr, - jlong 
in_size) { + jlong j_in_ptr, + jlong j_in_size, + jlong j_stream) { try { cudf::jni::auto_set_device(env); - return LZ4IsData(reinterpret_cast(in_ptr), in_size); + auto in_ptr = reinterpret_cast(j_in_ptr); + auto in_size = static_cast(j_in_size); + auto stream = reinterpret_cast(j_stream); + return nvcompLZ4IsData(in_ptr, in_size, stream); } CATCH_STD(env, 0) } @@ -125,370 +74,220 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_isLZ4Metadata(JN jlong metadata_ptr) { try { cudf::jni::auto_set_device(env); - return LZ4IsMetadata(reinterpret_cast(metadata_ptr)); + return nvcompLZ4IsMetadata(reinterpret_cast(metadata_ptr)); } CATCH_STD(env, 0) } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressGetTempSize( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jlong chunk_size) { - try { - cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - size_t temp_size; - auto status = nvcompLZ4CompressGetTempSize(reinterpret_cast(in_ptr), in_size, comp_type, - &opts, &temp_size); - check_nvcomp_status(env, status); - return temp_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressGetOutputSize( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jlong chunk_size, - jlong temp_ptr, jlong temp_size, jboolean compute_exact) { - try { - cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - size_t out_size; - auto status = nvcompLZ4CompressGetOutputSize( - reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, &out_size, compute_exact); - check_nvcomp_status(env, status); - return out_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4Compress( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jlong chunk_size, - jlong temp_ptr, jlong temp_size, jlong out_ptr, jlong out_size, jlong jstream) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressConfigure( + JNIEnv *env, jclass, jlong j_chunk_size, jlong j_uncompressed_size) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - auto stream = reinterpret_cast(jstream); - size_t compressed_size = out_size; - auto status = - nvcompLZ4CompressAsync(reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(out_ptr), &compressed_size, stream); + opts.chunk_size = static_cast(j_chunk_size); + auto uncompressed_size = static_cast(j_uncompressed_size); + std::size_t metadata_bytes = 0; + std::size_t temp_bytes = 0; + std::size_t out_bytes = 0; + auto status = nvcompLZ4CompressConfigure(&opts, NVCOMP_TYPE_CHAR, uncompressed_size, + &metadata_bytes, &temp_bytes, &out_bytes); check_nvcomp_status(env, status); - if (cudaStreamSynchronize(stream) != cudaSuccess) { - JNI_THROW_NEW(env, NVCOMP_CUDA_ERROR_CLASS, "Error synchronizing stream", 0); - } - return compressed_size; + cudf::jni::native_jlongArray result(env, 3); + result[0] = static_cast(metadata_bytes); + result[1] = static_cast(temp_bytes); + result[2] = static_cast(out_bytes); + return result.get_jArray(); } CATCH_STD(env, 0); } JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4CompressAsync( - JNIEnv *env, jclass, jlong 
compressed_output_ptr, jlong in_ptr, jlong in_size, jint input_type, - jlong chunk_size, jlong temp_ptr, jlong temp_size, jlong out_ptr, jlong out_size, - jlong jstream) { + JNIEnv *env, jclass, jlong j_compressed_size_ptr, jlong j_in_ptr, jlong j_in_size, + jint j_input_type, jlong j_chunk_size, jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptr, + jlong j_out_size, jlong j_stream) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); + auto in_ptr = reinterpret_cast(j_in_ptr); + auto in_size = static_cast(j_in_size); + auto comp_type = static_cast(j_input_type); nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - auto stream = reinterpret_cast(jstream); - auto compressed_size_ptr = reinterpret_cast(compressed_output_ptr); - *compressed_size_ptr = out_size; - auto status = - nvcompLZ4CompressAsync(reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(out_ptr), compressed_size_ptr, stream); + opts.chunk_size = static_cast(j_chunk_size); + auto temp_ptr = reinterpret_cast(j_temp_ptr); + auto temp_size = static_cast(j_temp_size); + auto out_ptr = reinterpret_cast(j_out_ptr); + auto compressed_size_ptr = reinterpret_cast(j_compressed_size_ptr); + auto stream = reinterpret_cast(j_stream); + auto status = nvcompLZ4CompressAsync(&opts, comp_type, in_ptr, in_size, temp_ptr, temp_size, + out_ptr, compressed_size_ptr, stream); check_nvcomp_status(env, status); } CATCH_STD(env, ); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetMetadata( - JNIEnv *env, jclass, jlongArray in_ptrs, jlongArray in_sizes, jlong jstream) { +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4DecompressConfigure( + JNIEnv *env, jclass, jlong j_input_ptr, jlong j_input_size, jlong j_stream) { try { cudf::jni::auto_set_device(env); - - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(sizes), - [](jlong x) -> size_t { return static_cast(x); }); - + auto compressed_ptr = reinterpret_cast(j_input_ptr); + auto compressed_bytes = static_cast(j_input_size); void *metadata_ptr = nullptr; - auto stream = reinterpret_cast(jstream); - auto status = nvcompBatchedLZ4DecompressGetMetadata(input_ptrs.data(), sizes.data(), - input_ptrs.size(), &metadata_ptr, stream); - check_nvcomp_status(env, status); - return reinterpret_cast(metadata_ptr); - } - CATCH_STD(env, 0); -} - -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressDestroyMetadata( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - nvcompBatchedLZ4DecompressDestroyMetadata(reinterpret_cast(metadata_ptr)); - } - CATCH_STD(env, ); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetTempSize( - JNIEnv *env, jclass, jlong metadata_ptr) { - try { - cudf::jni::auto_set_device(env); - size_t temp_size; + std::size_t metadata_bytes = 0; + std::size_t temp_bytes = 0; + std::size_t uncompressed_bytes = 0; + auto stream = reinterpret_cast(j_stream); auto status = - nvcompBatchedLZ4DecompressGetTempSize(reinterpret_cast(metadata_ptr), &temp_size); + nvcompLZ4DecompressConfigure(compressed_ptr, 
compressed_bytes, &metadata_ptr, + &metadata_bytes, &temp_bytes, &uncompressed_bytes, stream); check_nvcomp_status(env, status); - return static_cast(temp_size); + cudf::jni::native_jlongArray result(env, 4); + result[0] = reinterpret_cast(metadata_ptr); + result[1] = static_cast(metadata_bytes); + result[2] = static_cast(temp_bytes); + result[3] = static_cast(uncompressed_bytes); + return result.get_jArray(); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetOutputSize( - JNIEnv *env, jclass, jlong metadata_ptr, jint num_outputs) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4DecompressAsync( + JNIEnv *env, jclass, jlong j_in_ptr, jlong j_in_size, jlong j_metadata_ptr, + jlong j_metadata_size, jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptr, jlong j_out_size, + jlong j_stream) { try { cudf::jni::auto_set_device(env); - std::vector sizes(num_outputs); - auto status = nvcompBatchedLZ4DecompressGetOutputSize(reinterpret_cast(metadata_ptr), - num_outputs, sizes.data()); + auto compressed_ptr = reinterpret_cast(j_in_ptr); + auto compressed_bytes = static_cast(j_in_size); + auto metadata_ptr = reinterpret_cast(j_metadata_ptr); + auto metadata_bytes = static_cast(j_metadata_size); + auto temp_ptr = reinterpret_cast(j_temp_ptr); + auto temp_bytes = static_cast(j_temp_size); + auto uncompressed_ptr = reinterpret_cast(j_out_ptr); + auto uncompressed_bytes = static_cast(j_out_size); + auto stream = reinterpret_cast(j_stream); + auto status = nvcompLZ4DecompressAsync(compressed_ptr, compressed_bytes, metadata_ptr, + metadata_bytes, temp_ptr, temp_bytes, uncompressed_ptr, + uncompressed_bytes, stream); check_nvcomp_status(env, status); - cudf::jni::native_jlongArray jsizes(env, num_outputs); - std::transform(sizes.begin(), sizes.end(), jsizes.data(), - [](size_t x) -> jlong { return static_cast(x); }); - return jsizes.get_jArray(); } - CATCH_STD(env, NULL); + CATCH_STD(env, ); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressAsync( - JNIEnv *env, jclass, jlongArray in_ptrs, jlongArray in_sizes, jlong temp_ptr, jlong temp_size, - jlong metadata_ptr, jlongArray out_ptrs, jlongArray out_sizes, jlong jstream) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_lz4DestroyMetadata(JNIEnv *env, jclass, + jlong metadata_ptr) { try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector input_sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(input_sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - cudf::jni::native_jpointerArray output_ptrs(env, out_ptrs); - cudf::jni::native_jlongArray output_jsizes(env, out_sizes); - if (output_ptrs.size() != output_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "output array size mismatch"); - } - if (input_ptrs.size() != output_ptrs.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input/output array size mismatch"); - } - std::vector output_sizes; - std::transform(output_jsizes.data(), output_jsizes.data() + output_jsizes.size(), - std::back_inserter(output_sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - auto stream = reinterpret_cast(jstream); - auto 
status = nvcompBatchedLZ4DecompressAsync( - input_ptrs.data(), input_sizes.data(), input_ptrs.size(), - reinterpret_cast(temp_ptr), static_cast(temp_size), - reinterpret_cast(metadata_ptr), output_ptrs.data(), output_sizes.data(), stream); - check_nvcomp_status(env, status); + nvcompLZ4DestroyMetadata(reinterpret_cast(metadata_ptr)); } CATCH_STD(env, ); } JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetTempSize( - JNIEnv *env, jclass, jlongArray in_ptrs, jlongArray in_sizes, jlong chunk_size) { + JNIEnv *env, jclass, jlong j_batch_size, jlong j_max_chunk_size) { try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - size_t temp_size = 0; - auto status = nvcompBatchedLZ4CompressGetTempSize(input_ptrs.data(), sizes.data(), - input_ptrs.size(), &opts, &temp_size); + auto batch_size = static_cast(j_batch_size); + auto max_chunk_size = static_cast(j_max_chunk_size); + std::size_t temp_size = 0; + auto status = nvcompBatchedLZ4CompressGetTempSize(batch_size, max_chunk_size, + nvcompBatchedLZ4DefaultOpts, &temp_size); check_nvcomp_status(env, status); return static_cast(temp_size); } CATCH_STD(env, 0); } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetOutputSize( - JNIEnv *env, jclass, jlongArray in_ptrs, jlongArray in_sizes, jlong chunk_size, jlong temp_ptr, - jlong temp_size) { +JNIEXPORT jlong JNICALL +Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetMaxOutputChunkSize( + JNIEnv *env, jclass, jlong j_max_chunk_size) { try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector input_sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(input_sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - std::vector output_sizes(input_ptrs.size()); - auto status = nvcompBatchedLZ4CompressGetOutputSize( - input_ptrs.data(), input_sizes.data(), input_ptrs.size(), &opts, - reinterpret_cast(temp_ptr), static_cast(temp_size), output_sizes.data()); + auto max_chunk_size = static_cast(j_max_chunk_size); + std::size_t max_output_size = 0; + auto status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize( + max_chunk_size, nvcompBatchedLZ4DefaultOpts, &max_output_size); check_nvcomp_status(env, status); - cudf::jni::native_jlongArray jsizes(env, input_ptrs.size()); - std::transform(output_sizes.begin(), output_sizes.end(), jsizes.data(), - [](size_t x) -> jlong { return static_cast(x); }); - return jsizes.get_jArray(); + return static_cast(max_output_size); } - CATCH_STD(env, NULL); + CATCH_STD(env, 0); } JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressAsync( - JNIEnv *env, jclass, jlong compressed_sizes_out_ptr, jlongArray in_ptrs, 
jlongArray in_sizes, - jlong chunk_size, jlong temp_ptr, jlong temp_size, jlongArray out_ptrs, jlongArray out_sizes, - jlong jstream) { + JNIEnv *env, jclass, jlong j_in_ptrs, jlong j_in_sizes, jlong j_chunk_size, jlong j_batch_size, + jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptrs, jlong j_compressed_sizes_out_ptr, + jlong j_stream) { try { cudf::jni::auto_set_device(env); - cudf::jni::native_jpointerArray input_ptrs(env, in_ptrs); - cudf::jni::native_jlongArray input_jsizes(env, in_sizes); - if (input_ptrs.size() != input_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input array size mismatch"); - } - std::vector input_sizes; - std::transform(input_jsizes.data(), input_jsizes.data() + input_jsizes.size(), - std::back_inserter(input_sizes), - [](jlong x) -> size_t { return static_cast(x); }); - - cudf::jni::native_jpointerArray output_ptrs(env, out_ptrs); - cudf::jni::native_jlongArray output_jsizes(env, out_sizes); - if (output_ptrs.size() != output_jsizes.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "output array size mismatch"); - } - if (input_ptrs.size() != output_ptrs.size()) { - cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, "input/output array size mismatch"); - } - - auto output_sizes = reinterpret_cast(compressed_sizes_out_ptr); - std::transform(output_jsizes.data(), output_jsizes.data() + output_jsizes.size(), output_sizes, - [](jlong x) -> size_t { return static_cast(x); }); - - nvcompLZ4FormatOpts opts{}; - opts.chunk_size = chunk_size; - auto stream = reinterpret_cast(jstream); - auto status = nvcompBatchedLZ4CompressAsync( - input_ptrs.data(), input_sizes.data(), input_ptrs.size(), &opts, - reinterpret_cast(temp_ptr), static_cast(temp_size), output_ptrs.data(), - output_sizes, // input/output parameter - stream); + auto in_ptrs = reinterpret_cast(j_in_ptrs); + auto in_sizes = reinterpret_cast(j_in_sizes); + auto chunk_size = static_cast(j_chunk_size); + auto batch_size = static_cast(j_batch_size); + auto temp_ptr = reinterpret_cast(j_temp_ptr); + auto temp_size = static_cast(j_temp_size); + auto out_ptrs = reinterpret_cast(j_out_ptrs); + auto compressed_out_sizes = reinterpret_cast(j_compressed_sizes_out_ptr); + auto stream = reinterpret_cast(j_stream); + auto status = nvcompBatchedLZ4CompressAsync(in_ptrs, in_sizes, chunk_size, batch_size, temp_ptr, + temp_size, out_ptrs, compressed_out_sizes, + nvcompBatchedLZ4DefaultOpts, stream); check_nvcomp_status(env, status); } CATCH_STD(env, ); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_cascadedCompressGetTempSize( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jint num_rles, - jint num_deltas, jboolean use_bp) { - try { - cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompCascadedFormatOpts opts{}; - opts.num_RLEs = num_rles; - opts.num_deltas = num_deltas; - opts.use_bp = use_bp; - size_t temp_size; - auto status = nvcompCascadedCompressGetTempSize(reinterpret_cast(in_ptr), in_size, - comp_type, &opts, &temp_size); - check_nvcomp_status(env, status); - return temp_size; - } - CATCH_STD(env, 0); -} - -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_cascadedCompressGetOutputSize( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jint num_rles, - jint num_deltas, jboolean use_bp, jlong temp_ptr, jlong temp_size, jboolean compute_exact) { +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetTempSize( + JNIEnv *env, jclass, 
jlong j_batch_size, jlong j_chunk_size) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompCascadedFormatOpts opts{}; - opts.num_RLEs = num_rles; - opts.num_deltas = num_deltas; - opts.use_bp = use_bp; - size_t out_size; - auto status = nvcompCascadedCompressGetOutputSize( - reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, &out_size, compute_exact); + auto batch_size = static_cast(j_batch_size); + auto chunk_size = static_cast(j_chunk_size); + std::size_t temp_size = 0; + auto status = nvcompBatchedLZ4DecompressGetTempSize(batch_size, chunk_size, &temp_size); check_nvcomp_status(env, status); - return out_size; + return static_cast(temp_size); } CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_cascadedCompress( - JNIEnv *env, jclass, jlong in_ptr, jlong in_size, jint input_type, jint num_rles, - jint num_deltas, jboolean use_bp, jlong temp_ptr, jlong temp_size, jlong out_ptr, - jlong out_size, jlong jstream) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressAsync( + JNIEnv *env, jclass, jlong j_in_ptrs, jlong j_in_sizes, jlong j_out_sizes, jlong j_batch_size, + jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptrs, jlong j_stream) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompCascadedFormatOpts opts{}; - opts.num_RLEs = num_rles; - opts.num_deltas = num_deltas; - opts.use_bp = use_bp; - auto stream = reinterpret_cast(jstream); - size_t compressed_size = out_size; - auto status = - nvcompCascadedCompressAsync(reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(out_ptr), &compressed_size, stream); + auto compressed_ptrs = reinterpret_cast(j_in_ptrs); + auto compressed_sizes = reinterpret_cast(j_in_sizes); + auto uncompressed_sizes = reinterpret_cast(j_out_sizes); + auto batch_size = static_cast(j_batch_size); + auto temp_ptr = reinterpret_cast(j_temp_ptr); + auto temp_size = static_cast(j_temp_size); + auto uncompressed_ptrs = reinterpret_cast(j_out_ptrs); + auto stream = reinterpret_cast(j_stream); + auto uncompressed_statuses = rmm::device_uvector(batch_size, stream); + auto actual_uncompressed_sizes = rmm::device_uvector(batch_size, stream); + auto status = nvcompBatchedLZ4DecompressAsync( + compressed_ptrs, compressed_sizes, uncompressed_sizes, actual_uncompressed_sizes.data(), + batch_size, temp_ptr, temp_size, uncompressed_ptrs, uncompressed_statuses.data(), stream); check_nvcomp_status(env, status); - if (cudaStreamSynchronize(stream) != cudaSuccess) { - JNI_THROW_NEW(env, NVCOMP_CUDA_ERROR_CLASS, "Error synchronizing stream", 0); + if (!cudf::java::check_nvcomp_output_sizes(uncompressed_sizes, actual_uncompressed_sizes.data(), + batch_size, stream)) { + cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS, + "nvcomp decompress output size mismatch"); } - return compressed_size; } - CATCH_STD(env, 0); + CATCH_STD(env, ); } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_cascadedCompressAsync( - JNIEnv *env, jclass, jlong compressed_output_ptr, jlong in_ptr, jlong in_size, jint input_type, - jint num_rles, jint num_deltas, jboolean use_bp, jlong temp_ptr, jlong temp_size, jlong out_ptr, - jlong out_size, jlong jstream) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4GetDecompressSizeAsync( + JNIEnv *env, jclass, jlong j_in_ptrs, jlong j_in_sizes, jlong j_out_sizes, jlong j_batch_size, + 
jlong j_stream) { try { cudf::jni::auto_set_device(env); - auto comp_type = static_cast(input_type); - nvcompCascadedFormatOpts opts{}; - opts.num_RLEs = num_rles; - opts.num_deltas = num_deltas; - opts.use_bp = use_bp; - auto stream = reinterpret_cast(jstream); - auto compressed_size_ptr = reinterpret_cast(compressed_output_ptr); - *compressed_size_ptr = out_size; - auto status = - nvcompCascadedCompressAsync(reinterpret_cast(in_ptr), in_size, comp_type, &opts, - reinterpret_cast(temp_ptr), temp_size, - reinterpret_cast(out_ptr), compressed_size_ptr, stream); + auto compressed_ptrs = reinterpret_cast(j_in_ptrs); + auto compressed_sizes = reinterpret_cast(j_in_sizes); + auto uncompressed_sizes = reinterpret_cast(j_out_sizes); + auto batch_size = static_cast(j_batch_size); + auto stream = reinterpret_cast(j_stream); + auto status = nvcompBatchedLZ4GetDecompressSizeAsync(compressed_ptrs, compressed_sizes, + uncompressed_sizes, batch_size, stream); check_nvcomp_status(env, status); } CATCH_STD(env, ); diff --git a/java/src/main/native/src/NvtxRangeJni.cpp b/java/src/main/native/src/NvtxRangeJni.cpp index 3e50327be8b..1f12b2ea8cc 100644 --- a/java/src/main/native/src/NvtxRangeJni.cpp +++ b/java/src/main/native/src/NvtxRangeJni.cpp @@ -17,14 +17,7 @@ #include #include "jni_utils.hpp" - -namespace { - -struct java_domain { - static constexpr char const *name{"Java"}; -}; - -} // anonymous namespace +#include "nvtx_common.hpp" extern "C" { @@ -34,14 +27,14 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_NvtxRange_push(JNIEnv *env, jclass cl cudf::jni::native_jstring range_name(env, name); nvtx3::color range_color(static_cast(color_bits)); nvtx3::event_attributes attr{range_color, range_name.get()}; - nvtxDomainRangePushEx(nvtx3::domain::get(), attr.get()); + nvtxDomainRangePushEx(nvtx3::domain::get(), attr.get()); } CATCH_STD(env, ); } JNIEXPORT void JNICALL Java_ai_rapids_cudf_NvtxRange_pop(JNIEnv *env, jclass clazz) { try { - nvtxDomainRangePop(nvtx3::domain::get()); + nvtxDomainRangePop(nvtx3::domain::get()); } CATCH_STD(env, ); } diff --git a/java/src/main/native/src/NvtxUniqueRangeJni.cpp b/java/src/main/native/src/NvtxUniqueRangeJni.cpp new file mode 100644 index 00000000000..d6c321b5fd2 --- /dev/null +++ b/java/src/main/native/src/NvtxUniqueRangeJni.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "jni_utils.hpp" +#include "nvtx_common.hpp" + +extern "C" { + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_NvtxUniqueRange_start(JNIEnv *env, jclass clazz, + jstring name, jint color_bits) { + try { + cudf::jni::native_jstring range_name(env, name); + nvtx3::color range_color(static_cast(color_bits)); + nvtx3::event_attributes attr{range_color, range_name.get()}; + auto nvtxRangeId = + nvtxDomainRangeStartEx(nvtx3::domain::get(), attr.get()); + return static_cast(nvtxRangeId); + } + CATCH_STD(env, 0); +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_NvtxUniqueRange_end(JNIEnv *env, jclass clazz, + jlong nvtxRangeId) { + try { + nvtxDomainRangeEnd(nvtx3::domain::get(), + static_cast(nvtxRangeId)); + } + CATCH_STD(env, ); +} + +} // extern "C" diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 1cf8b4837cb..d07b754c8db 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -210,17 +210,6 @@ class java_event_handler_memory_resource final : public device_memory_resource { } bool on_alloc_fail(std::size_t num_bytes) { - cudaError_t err = cudaPeekAtLastError(); - if (err != cudaSuccess) { - // workaround for RMM pooled mode (CNMEM backend) leaving a CUDA error pending - if (err == cudaErrorMemoryAllocation) { - cudaGetLastError(); - } else { - // let this allocation fail so the application can see the CUDA error - return false; - } - } - JNIEnv *env = cudf::jni::get_jni_env(jvm); jboolean result = env->CallBooleanMethod(handler_obj, on_alloc_fail_method, static_cast(num_bytes)); @@ -256,7 +245,7 @@ class java_event_handler_memory_resource final : public device_memory_resource { total_before = get_total_bytes_allocated(); result = resource->allocate(num_bytes, stream); break; - } catch (std::bad_alloc const &e) { + } catch (rmm::out_of_memory const &e) { if (!on_alloc_fail(num_bytes)) { throw; } @@ -333,9 +322,9 @@ std::unique_ptr Cuda_memory_resource{}; extern "C" { -JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal( - JNIEnv *env, jclass clazz, jint allocation_mode, jint log_to, jstring jpath, jlong pool_size, - jlong max_pool_size, jlong allocation_alignment, jlong alignment_threshold) { +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, jclass clazz, + jint allocation_mode, jint log_to, + jstring jpath, jlong pool_size) { try { // make sure the CUDA device is setup in the context cudaError_t cuda_status = cudaFree(0); @@ -349,51 +338,33 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal( bool use_arena_alloc = allocation_mode & 4; bool use_cuda_async_alloc = allocation_mode & 8; if (use_pool_alloc) { - auto pool_limit = (max_pool_size > 0) ? - thrust::optional{static_cast(max_pool_size)} : - thrust::nullopt; if (use_managed_mem) { Initialized_resource = rmm::mr::make_owning_wrapper( - std::make_shared(), pool_size, pool_limit); + std::make_shared(), pool_size, pool_size); } else { Initialized_resource = rmm::mr::make_owning_wrapper( - std::make_shared(), pool_size, pool_limit); + std::make_shared(), pool_size, pool_size); } } else if (use_arena_alloc) { - std::size_t pool_limit = (max_pool_size > 0) ? 
static_cast(max_pool_size) : - std::numeric_limits::max(); if (use_managed_mem) { Initialized_resource = rmm::mr::make_owning_wrapper( - std::make_shared(), pool_size, pool_limit); + std::make_shared(), pool_size, pool_size); } else { Initialized_resource = rmm::mr::make_owning_wrapper( - std::make_shared(), pool_size, pool_limit); + std::make_shared(), pool_size, pool_size); } } else if (use_cuda_async_alloc) { - auto const pool_limit = max_pool_size > 0 ? static_cast(max_pool_size) : - std::numeric_limits::max(); - auto const release_threshold = max_pool_size > 0 ? - thrust::optional{max_pool_size} : - thrust::optional{}; // Use `limiting_resource_adaptor` to set a hard limit on the max pool size since // `cuda_async_memory_resource` only has a release threshold. Initialized_resource = rmm::mr::make_owning_wrapper( - std::make_shared(pool_size, release_threshold), - pool_limit); + std::make_shared(pool_size, pool_size), pool_size); } else if (use_managed_mem) { Initialized_resource = std::make_shared(); } else { Initialized_resource = std::make_shared(); } - if (allocation_alignment != 0) { - Initialized_resource = rmm::mr::make_owning_wrapper( - Initialized_resource, allocation_alignment, alignment_threshold); - } - - auto wrapped = make_tracking_adaptor( - Initialized_resource.get(), - std::max(RMM_ALLOC_SIZE_ALIGNMENT, static_cast(allocation_alignment))); + auto wrapped = make_tracking_adaptor(Initialized_resource.get(), RMM_ALLOC_SIZE_ALIGNMENT); Tracking_memory_resource.reset(wrapped); auto resource = Tracking_memory_resource.get(); diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index ee75112a2ed..c66cf13a5ae 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -143,6 +143,13 @@ class jni_writer_data_sink final : public cudf::io::data_sink { stream.synchronize(); } + std::future device_write_async(void const *gpu_data, size_t size, + rmm::cuda_stream_view stream) override { + // Call the sync version until figuring out how to write asynchronously. 
+ device_write(gpu_data, size, stream); + return std::async(std::launch::deferred, [] {}); + } + void flush() override { if (current_buffer_written > 0) { JNIEnv *env = cudf::jni::get_jni_env(jvm); @@ -677,8 +684,10 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, cudf::io::column_in_metadata child; child.set_name(col_names[read_index]) .set_decimal_precision(precisions[read_index]) - .set_int96_timestamps(is_int96[read_index]) .set_nullability(nullability[read_index]); + if (!is_int96.is_null()) { + child.set_int96_timestamps(is_int96[read_index]); + } if (is_map[read_index]) { child.set_list_column_as_map(); } @@ -696,13 +705,12 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_names, jintArray &j_children, jbooleanArray &j_col_nullability, jobjectArray &j_metadata_keys, jobjectArray &j_metadata_values, - jint j_compression, jint j_stats_freq, jbooleanArray &j_isInt96, - jintArray &j_precisions, jbooleanArray &j_is_map, - cudf::io::table_input_metadata &metadata) { + jbooleanArray &j_is_int96, jintArray &j_precisions, + jbooleanArray &j_is_map, cudf::io::table_input_metadata &metadata) { cudf::jni::auto_set_device(env); cudf::jni::native_jstringArray col_names(env, j_col_names); cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); - cudf::jni::native_jbooleanArray isInt96(env, j_isInt96); + cudf::jni::native_jbooleanArray is_int96(env, j_is_int96); cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); cudf::jni::native_jstringArray meta_values(env, j_metadata_values); cudf::jni::native_jintArray precisions(env, j_precisions); @@ -719,8 +727,10 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam metadata.column_metadata[write_index] .set_name(cpp_names[read_index]) .set_nullability(col_nullability[read_index]) - .set_int96_timestamps(isInt96[read_index]) .set_decimal_precision(precisions[read_index]); + if (!is_int96.is_null()) { + metadata.column_metadata[write_index].set_int96_timestamps(is_int96[read_index]); + } if (is_map[read_index]) { metadata.column_metadata[write_index].set_list_column_as_map(); } @@ -728,7 +738,7 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam if (childs_children > 0) { read_index = set_column_metadata(metadata.column_metadata[write_index], cpp_names, col_nullability, - isInt96, precisions, is_map, children, childs_children, read_index); + is_int96, precisions, is_map, children, childs_children, read_index); } } for (auto i = 0; i < meta_keys.size(); ++i) { @@ -736,29 +746,6 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam } } -cudf::io::table_input_metadata createORCTableInputMetadata(JNIEnv *env, - jobjectArray const &j_col_names, - jbooleanArray const &j_col_nullability, - jobjectArray const &j_metadata_keys, - jobjectArray const &j_metadata_values) { - cudf::jni::native_jstringArray const col_names(env, j_col_names); - cudf::jni::native_jbooleanArray const col_nullability(env, j_col_nullability); - cudf::jni::native_jstringArray const meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray const meta_values(env, j_metadata_values); - - std::vector const cpp_names = col_names.as_cpp_vector(); - std::size_t const num_columns = cpp_names.size(); - cudf::io::table_input_metadata metadata; - metadata.column_metadata.resize(cpp_names.size()); - for (std::size_t i = 0; i < num_columns; i++) { - 
metadata.column_metadata[i].set_name(cpp_names[i]).set_nullability(col_nullability[i]); - } - for (int i = 0; i < meta_keys.size(); ++i) { - metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); - } - return metadata; -} - // Check that window parameters are valid. bool valid_window_parameters(native_jintArray const &values, native_jpointerArray const &ops, @@ -1376,8 +1363,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( sink_info sink{data_sink.get()}; table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, - j_metadata_keys, j_metadata_values, j_compression, j_stats_freq, j_isInt96, - j_precisions, j_is_map, metadata); + j_metadata_keys, j_metadata_values, j_isInt96, j_precisions, j_is_map, + metadata); chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) @@ -1410,8 +1397,8 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( using namespace cudf::jni; table_input_metadata metadata; createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, - j_metadata_keys, j_metadata_values, j_compression, j_stats_freq, j_isInt96, - j_precisions, j_is_map, metadata); + j_metadata_keys, j_metadata_values, j_isInt96, j_precisions, j_is_map, + metadata); sink_info sink{output_path.get()}; chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) @@ -1512,9 +1499,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC( } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( - JNIEnv *env, jclass, jobjectArray j_col_names, jbooleanArray j_col_nullability, - jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, - jobject consumer) { + JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, + jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, + jint j_compression, jintArray j_precisions, jbooleanArray j_is_map, jobject consumer) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1523,8 +1510,13 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( try { cudf::jni::auto_set_device(env); using namespace cudf::io; - table_input_metadata metadata = cudf::jni::createORCTableInputMetadata( - env, j_col_names, j_col_nullability, j_metadata_keys, j_metadata_values); + using namespace cudf::jni; + table_input_metadata metadata; + // ORC has no `j_is_int96`, but `createTableMetaData` needs a lvalue. 
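+      // Passing NULL here is safe: createTableMetaData only calls
+      // set_int96_timestamps when the wrapped native_jbooleanArray is non-null
+      // (see the is_int96.is_null() checks above).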
+ jbooleanArray j_is_int96 = NULL; + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, + j_metadata_keys, j_metadata_values, j_is_int96, j_precisions, j_is_map, + metadata); std::unique_ptr data_sink( new cudf::jni::jni_writer_data_sink(env, consumer)); @@ -1543,9 +1535,9 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( } JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( - JNIEnv *env, jclass, jobjectArray j_col_names, jbooleanArray j_col_nullability, - jobjectArray j_metadata_keys, jobjectArray j_metadata_values, jint j_compression, - jstring j_output_path) { + JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children, + jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values, + jint j_compression, jintArray j_precisions, jbooleanArray j_is_map, jstring j_output_path) { JNI_NULL_CHECK(env, j_col_names, "null columns", 0); JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0); JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0); @@ -1554,10 +1546,14 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( try { cudf::jni::auto_set_device(env); using namespace cudf::io; + using namespace cudf::jni; cudf::jni::native_jstring output_path(env, j_output_path); - - table_input_metadata metadata = cudf::jni::createORCTableInputMetadata( - env, j_col_names, j_col_nullability, j_metadata_keys, j_metadata_values); + table_input_metadata metadata; + // ORC has no `j_is_int96`, but `createTableMetaData` needs a lvalue. + jbooleanArray j_is_int96 = NULL; + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, + j_metadata_keys, j_metadata_values, j_is_int96, j_precisions, j_is_map, + metadata); sink_info sink{output_path.get()}; chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) diff --git a/java/src/main/native/src/check_nvcomp_output_sizes.cu b/java/src/main/native/src/check_nvcomp_output_sizes.cu new file mode 100644 index 00000000000..944399882b8 --- /dev/null +++ b/java/src/main/native/src/check_nvcomp_output_sizes.cu @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include + +#include "check_nvcomp_output_sizes.hpp" + +namespace { + +struct java_domain { + static constexpr char const *name{"Java"}; +}; + +} // anonymous namespace + +namespace cudf { +namespace java { + +/** + * Check that the vector of expected uncompressed sizes matches the vector of actual compressed + * sizes. Both vectors are assumed to be in device memory and contain num_chunks elements. 
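+ *
+ * Returns true only if every corresponding pair of sizes is equal; the comparison
+ * is performed on the given stream via thrust::equal.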
+ */ +bool check_nvcomp_output_sizes(std::size_t const *dev_uncompressed_sizes, + std::size_t const *dev_actual_uncompressed_sizes, + std::size_t num_chunks, rmm::cuda_stream_view stream) { + NVTX3_FUNC_RANGE_IN(java_domain); + return thrust::equal(rmm::exec_policy(stream), dev_uncompressed_sizes, + dev_uncompressed_sizes + num_chunks, dev_actual_uncompressed_sizes); +} + +} // namespace java +} // namespace cudf diff --git a/java/src/main/native/src/check_nvcomp_output_sizes.hpp b/java/src/main/native/src/check_nvcomp_output_sizes.hpp new file mode 100644 index 00000000000..00b36471a85 --- /dev/null +++ b/java/src/main/native/src/check_nvcomp_output_sizes.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf { +namespace java { + +/** + * Check that the vector of expected uncompressed sizes matches the vector of actual compressed + * sizes. Both vectors are assumed to be in device memory and contain num_chunks elements. + */ +bool check_nvcomp_output_sizes(std::size_t const *dev_uncompressed_sizes, + std::size_t const *dev_actual_uncompressed_sizes, + std::size_t num_chunks, rmm::cuda_stream_view stream); +} // namespace java +} // namespace cudf diff --git a/cpp/src/io/json/json_common.h b/java/src/main/native/src/nvtx_common.hpp similarity index 74% rename from cpp/src/io/json/json_common.h rename to java/src/main/native/src/nvtx_common.hpp index 803b937e58d..8b5b04f3370 100644 --- a/cpp/src/io/json/json_common.h +++ b/java/src/main/native/src/nvtx_common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,8 +16,12 @@ #pragma once -#include -#include -#include +namespace cudf { +namespace jni { -using cudf::io::detail::string_index_pair; +struct java_domain { + static constexpr char const *name{"Java"}; +}; + +} // namespace jni +} // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index d1af0d9a2f6..c767a98b342 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -3085,6 +3085,17 @@ void testWindowStatic() { ColumnVector result = v1.rollingWindow(RollingAggregation.lag(1, defaultOutput), options)) { assertColumnsAreEqual(expected, result); } + + try (ColumnVector expected = ColumnVector.fromBoxedDoubles(0.7071d, 1.5275d, 1.5275d, 1d, 1.4142); + ColumnVector result = v1.rollingWindow(RollingAggregation.standardDeviation(), options)) { + assertColumnsAreEqual(expected, result); + } + + try (ColumnVector expected = + ColumnVector.fromBoxedDoubles(Double.POSITIVE_INFINITY, 2.1602d, 2.1602d, 1.4142d, Double.POSITIVE_INFINITY); + ColumnVector result = v1.rollingWindow(RollingAggregation.standardDeviation(2), options)) { + assertColumnsAreEqual(expected, result); + } } } } diff --git a/java/src/test/java/ai/rapids/cudf/CudfTestBase.java b/java/src/test/java/ai/rapids/cudf/CudfTestBase.java index 66ecd110983..a4450e2869c 100644 --- a/java/src/test/java/ai/rapids/cudf/CudfTestBase.java +++ b/java/src/test/java/ai/rapids/cudf/CudfTestBase.java @@ -43,7 +43,7 @@ public CudfTestBase(int allocationMode, long poolSize) { void beforeEach() { assumeTrue(Cuda.isEnvCompatibleForTesting()); if (!Rmm.isInitialized()) { - Rmm.initialize(rmmAllocationMode, false, rmmPoolSize); + Rmm.initialize(rmmAllocationMode, Rmm.logToStderr(), rmmPoolSize); } } diff --git a/java/src/test/java/ai/rapids/cudf/NvtxTest.java b/java/src/test/java/ai/rapids/cudf/NvtxTest.java new file mode 100644 index 00000000000..f01b0a19eb5 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/NvtxTest.java @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class NvtxTest { + @Test + public void testNvtxStartEndEnclosed() { + NvtxUniqueRange range1 = new NvtxUniqueRange("start/end", NvtxColor.RED); + NvtxUniqueRange range2 = new NvtxUniqueRange("enclosed start/end", NvtxColor.BLUE); + range2.close(); + range1.close(); + } + + @Test + public void testNvtxStartEndCloseOutOfOrder() { + NvtxUniqueRange range1 = new NvtxUniqueRange("start/end closes first", NvtxColor.RED); + NvtxUniqueRange range2 = new NvtxUniqueRange("start/end closes later", NvtxColor.BLUE); + range1.close(); + range2.close(); + } + + @Test + public void testNvtxPushPop() { + try(NvtxRange range1 = new NvtxRange("push/pop", NvtxColor.RED)) { + try(NvtxRange range2 = new NvtxRange("enclosed push/pop", NvtxColor.BLUE)) { + } + } + } + + @Test + public void testNvtxPushPopEnclosingStartEnd() { + try(NvtxRange range1 = new NvtxRange("push/pop", NvtxColor.RED)) { + NvtxUniqueRange range2 = new NvtxUniqueRange("enclosed start/end", NvtxColor.BLUE); + range2.close(); + } + } + + @Test + public void testNvtxPushPopAndStartEndCloseOutOfOrder() { + NvtxUniqueRange range2; + try(NvtxRange range1 = new NvtxRange("push/pop closes first", NvtxColor.RED)) { + range2 = new NvtxUniqueRange("start/end closes later", NvtxColor.BLUE); + } + range2.close(); + } + + @Test + public void testNvtxUniqueRangeCloseMultipleTimes() { + NvtxUniqueRange range = new NvtxUniqueRange("range", NvtxColor.RED); + range.close(); + assertThrows(IllegalStateException.class, () -> { + range.close(); + }); + } +} diff --git a/java/src/test/java/ai/rapids/cudf/RmmMemoryAccessorTest.java b/java/src/test/java/ai/rapids/cudf/RmmMemoryAccessorTest.java index 81afdcc1940..a9ee36e9b97 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmMemoryAccessorTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmMemoryAccessorTest.java @@ -63,7 +63,7 @@ public void init() { Rmm.shutdown(); } assertFalse(Rmm.isInitialized()); - Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, true, -1); + Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, Rmm.logToStderr(), -1); assertTrue(Rmm.isInitialized()); Rmm.shutdown(); assertFalse(Rmm.isInitialized()); @@ -74,7 +74,7 @@ public void shutdown() { if (Rmm.isInitialized()) { Rmm.shutdown(); } - Rmm.initialize(RmmAllocationMode.POOL, false, 2048); + Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 2048); try (DeviceMemoryBuffer buffer = DeviceMemoryBuffer.allocate(1024)) { assertThrows(RmmException.class, () -> Rmm.shutdown(500, 2000, TimeUnit.MILLISECONDS)); } @@ -91,9 +91,9 @@ public void allocate() { @Test public void doubleInitFails() { if (!Rmm.isInitialized()) { - Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 0); + Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, Rmm.logToStderr(), 0); } assertThrows(IllegalStateException.class, - () -> Rmm.initialize(RmmAllocationMode.POOL, false, 1024 * 1024)); + () -> Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 1024 * 1024)); } } diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java index f0ed699c9b9..f9d097158b6 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java @@ -57,7 +57,7 @@ public void teardown() { RmmAllocationMode.POOL, RmmAllocationMode.ARENA}) public void testTotalAllocated(int rmmAllocMode) { - Rmm.initialize(rmmAllocMode, false, 512 * 1024 * 1024); + 
Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024); assertEquals(0, Rmm.getTotalBytesAllocated()); try (DeviceMemoryBuffer ignored = Rmm.alloc(1024)) { assertEquals(1024, Rmm.getTotalBytesAllocated()); @@ -110,7 +110,7 @@ public boolean onAllocFailure(long sizeRequested) { @Test public void testSetEventHandlerTwice() { - Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 0L); + Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, Rmm.logToStderr(), 0L); // installing an event handler the first time should not be an error Rmm.setEventHandler(new BaseRmmEventHandler() { @Override @@ -131,7 +131,7 @@ public boolean onAllocFailure(long sizeRequested) { @Test public void testClearEventHandler() { - Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 0L); + Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, Rmm.logToStderr(), 0L); // clearing the event handler when it isn't set is not an error Rmm.clearEventHandler(); @@ -161,7 +161,7 @@ public void testAllocOnlyThresholds() { final AtomicInteger deallocInvocations = new AtomicInteger(0); final AtomicLong allocated = new AtomicLong(0); - Rmm.initialize(RmmAllocationMode.POOL, false, 1024 * 1024L); + Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 1024 * 1024L); RmmEventHandler handler = new RmmEventHandler() { @Override @@ -304,7 +304,7 @@ public void onDeallocThreshold(long totalAllocSize) { @Test public void testExceptionHandling() { - Rmm.initialize(RmmAllocationMode.POOL, false, 1024 * 1024L); + Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 1024 * 1024L); RmmEventHandler handler = new RmmEventHandler() { @Override @@ -344,7 +344,7 @@ public void onDeallocThreshold(long totalAllocSize) { public void testThreadAutoDeviceSetup() throws Exception { // A smoke-test for automatic CUDA device setup for threads calling // into cudf. Hard to fully test without requiring multiple CUDA devices. 
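    // The executor thread created below should be able to call into cudf without an
    // explicit Cuda.setDevice(); the JNI layer is expected to select the device automatically.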
- Rmm.initialize(RmmAllocationMode.POOL, false, 1024 * 1024L); + Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 1024 * 1024L); DeviceMemoryBuffer buff = Rmm.alloc(1024); try { ExecutorService executor = Executors.newSingleThreadExecutor(); @@ -368,62 +368,38 @@ public void testThreadAutoDeviceSetup() throws Exception { RmmAllocationMode.POOL, RmmAllocationMode.ARENA}) public void testSetDeviceThrowsAfterRmmInit(int rmmAllocMode) { - Rmm.initialize(rmmAllocMode, false, 1024 * 1024); + Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 1024 * 1024); assertThrows(CudfException.class, () -> Cuda.setDevice(Cuda.getDevice() + 1)); // Verify that auto set device does not Cuda.autoSetDevice(); } @Test - public void testPoolGrowth() { - Rmm.initialize(RmmAllocationMode.POOL, false, 1024); - try (DeviceMemoryBuffer ignored1 = Rmm.alloc(1024); - DeviceMemoryBuffer ignored2 = Rmm.alloc(2048); - DeviceMemoryBuffer ignored3 = Rmm.alloc(4096)) { - assertEquals(7168, Rmm.getTotalBytesAllocated()); - } - } - - @Test - public void testPoolLimit() { - Rmm.initialize(RmmAllocationMode.POOL, false, 1024, 2048); - try (DeviceMemoryBuffer ignored1 = Rmm.alloc(512); - DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) { + public void testPoolSize() { + Rmm.initialize(RmmAllocationMode.POOL, Rmm.logToStderr(), 1024); + try (DeviceMemoryBuffer ignored1 = Rmm.alloc(1024)) { assertThrows(OutOfMemoryError.class, () -> { - DeviceMemoryBuffer ignored3 = Rmm.alloc(1024); - ignored3.close(); + DeviceMemoryBuffer ignored2 = Rmm.alloc(1024); + ignored2.close(); }); } } @Test - public void testPoolLimitLessThanInitialSize() { - assertThrows(IllegalArgumentException.class, - () -> Rmm.initialize(RmmAllocationMode.POOL, false, 10240, 1024)); - } - - @Test - public void testPoolLimitNonPoolMode() { - assertThrows(IllegalArgumentException.class, - () -> Rmm.initialize(RmmAllocationMode.CUDA_DEFAULT, false, 1024, 2048)); - } - - @Test - public void testCudaAsyncMemoryResourceLimit() { + public void testCudaAsyncMemoryResourceSize() { try { - Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, false, 1024, 2048); + Rmm.initialize(RmmAllocationMode.CUDA_ASYNC, Rmm.logToStderr(), 1024); } catch (CudfException e) { // CUDA 11.2 introduced cudaMallocAsync, older CUDA Toolkit will skip this test. 
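      // assumeFalse() aborts (skips) the test when cudaMallocAsync is reported as
      // unsupported; any other CudfException falls through to the rethrow below.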
assumeFalse(e.getMessage().contains("cudaMallocAsync not supported")); throw e; } - try (DeviceMemoryBuffer ignored1 = Rmm.alloc(512); - DeviceMemoryBuffer ignored2 = Rmm.alloc(1024)) { + try (DeviceMemoryBuffer ignored1 = Rmm.alloc(1024)) { assertThrows(OutOfMemoryError.class, () -> { - DeviceMemoryBuffer ignored3 = Rmm.alloc(1024); - ignored3.close(); + DeviceMemoryBuffer ignored2 = Rmm.alloc(1024); + ignored2.close(); }); } } @@ -433,12 +409,12 @@ public void testCudaAsyncIsIncompatibleWithManaged() { assertThrows(IllegalArgumentException.class, () -> Rmm.initialize( RmmAllocationMode.CUDA_ASYNC | RmmAllocationMode.CUDA_MANAGED_MEMORY, - false, 1024, 2048)); + Rmm.logToStderr(), 1024)); } @Test public void testCudaMemoryBuffer() { - Rmm.initialize(RmmAllocationMode.ARENA, false, 1024); + Rmm.initialize(RmmAllocationMode.ARENA, Rmm.logToStderr(), 1024); try (CudaMemoryBuffer one = CudaMemoryBuffer.allocate(512); CudaMemoryBuffer two = CudaMemoryBuffer.allocate(1024)) { assertEquals(512, one.length); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 9c2f194375a..280a4d33ae9 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -52,7 +52,7 @@ import java.util.*; import java.util.stream.Collectors; -import static ai.rapids.cudf.ParquetColumnWriterOptions.mapColumn; +import static ai.rapids.cudf.ColumnWriterOptions.mapColumn; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; import static ai.rapids.cudf.ParquetWriterOptions.structBuilder; import static ai.rapids.cudf.Table.TestBuilder; @@ -3147,6 +3147,28 @@ void testSerializationRoundTripConcatOnHostEmpty() throws IOException { } } + @Test + void testSerializationRoundTripToHostEmpty() throws IOException { + DataType listStringsType = new ListType(true, new BasicType(true, DType.STRING)); + DataType mapType = new ListType(true, + new StructType(true, + new BasicType(false, DType.STRING), + new BasicType(false, DType.STRING))); + DataType structType = new StructType(true, + new BasicType(true, DType.INT8), + new BasicType(false, DType.FLOAT32)); + try (ColumnVector emptyInt = ColumnVector.fromInts(); + ColumnVector emptyDouble = ColumnVector.fromDoubles(); + ColumnVector emptyString = ColumnVector.fromStrings(); + ColumnVector emptyListString = ColumnVector.fromLists(listStringsType); + ColumnVector emptyMap = ColumnVector.fromLists(mapType); + ColumnVector emptyStruct = ColumnVector.fromStructs(structType); + Table t = new Table(emptyInt, emptyInt, emptyDouble, emptyString, + emptyListString, emptyMap, emptyStruct)) { + testSerializationRoundTripToHost(t); + } + } + @Test void testRoundRobinPartition() { try (Table t = new Table.TestBuilder() @@ -3285,6 +3307,49 @@ void testSerializationRoundTripConcatHostSide() throws IOException { } } + @Test + void testSerializationRoundTripToHost() throws IOException { + try (Table t = buildTestTable()) { + testSerializationRoundTripToHost(t); + } + } + + private void testSerializationRoundTripToHost(Table t) throws IOException { + long rowCount = t.getRowCount(); + ByteArrayOutputStream bout = new ByteArrayOutputStream(); + JCudfSerialization.writeToStream(t, bout, 0, rowCount); + ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray()); + DataInputStream din = new DataInputStream(bin); + + JCudfSerialization.SerializedTableHeader header = + new JCudfSerialization.SerializedTableHeader(din); + assertTrue(header.wasInitialized()); + try 
(HostMemoryBuffer buffer = HostMemoryBuffer.allocate(header.getDataLen())) { + JCudfSerialization.readTableIntoBuffer(din, header, buffer); + assertTrue(header.wasDataRead()); + HostColumnVector[] hostColumns = + JCudfSerialization.unpackHostColumnVectors(header, buffer); + try { + assertEquals(t.getNumberOfColumns(), hostColumns.length); + for (int i = 0; i < hostColumns.length; i++) { + HostColumnVector actual = hostColumns[i]; + assertEquals(rowCount, actual.getRowCount()); + try (HostColumnVector expected = t.getColumn(i).copyToHost()) { + assertPartialColumnsAreEqual(expected, 0, rowCount, actual, "COLUMN " + i, true, false); + } + } + } finally { + for (HostColumnVector c: hostColumns) { + // close child columns for multiple times should NOT throw exceptions + for (int i = 0; i < c.getNumChildren(); i++) { + c.getChildColumnView(i).close(); + } + c.close(); + } + } + } + } + @Test void testConcatHost() throws IOException { try (Table t1 = new Table.TestBuilder() @@ -6767,8 +6832,8 @@ void testParquetWriteToBufferChunkedInt96() { void testParquetWriteMap() throws IOException { ParquetWriterOptions options = ParquetWriterOptions.builder() .withMapColumn(mapColumn("my_map", - new ParquetColumnWriterOptions("key0", false), - new ParquetColumnWriterOptions("value0"))).build(); + new ColumnWriterOptions("key0", false), + new ColumnWriterOptions("value0"))).build(); File f = File.createTempFile("test-map", ".parquet"); List list1 = Arrays.asList(new HostColumnVector.StructData(Arrays.asList("a", "b"))); @@ -6984,11 +7049,9 @@ void testArrowIPCWriteToBufferChunked() { @Test void testORCWriteToBufferChunked() { - try (Table table0 = getExpectedFileTable(); + try (Table table0 = getExpectedFileTable(true); MyBufferConsumer consumer = new MyBufferConsumer()) { - String[] colNames = new String[table0.getNumberOfColumns()]; - Arrays.fill(colNames, ""); - ORCWriterOptions opts = ORCWriterOptions.builder().withColumnNames(colNames).build(); + ORCWriterOptions opts = createORCWriterOptionsWithNested(); try (TableWriter writer = Table.writeORCChunked(opts, consumer)) { writer.write(table0); writer.write(table0); @@ -7001,6 +7064,50 @@ void testORCWriteToBufferChunked() { } } + @Test + void testORCWriteToFileChunked() throws IOException { + File tempFile = File.createTempFile("test", ".orc"); + try (Table table0 = getExpectedFileTable(true)) { + ORCWriterOptions opts = createORCWriterOptionsWithNested(); + try (TableWriter writer = Table.writeORCChunked(opts, tempFile.getAbsoluteFile())) { + writer.write(table0); + } + try (Table table1 = Table.readORC(tempFile.getAbsoluteFile())) { + assertTablesAreEqual(table0, table1); + } + } finally { + tempFile.delete(); + } + } + + @Test + void testORCWriteMapChunked() throws IOException { + ORCWriterOptions options = ORCWriterOptions.builder() + .withMapColumn(mapColumn("my_map", + new ColumnWriterOptions("key0", false), + new ColumnWriterOptions("value0"))).build(); + File f = File.createTempFile("test-map", ".parquet"); + List list1 = + Arrays.asList(new HostColumnVector.StructData(Arrays.asList("a", "b"))); + List list2 = + Arrays.asList(new HostColumnVector.StructData(Arrays.asList("a", "c"))); + List list3 = + Arrays.asList(new HostColumnVector.StructData(Arrays.asList("e", "d"))); + HostColumnVector.StructType structType = new HostColumnVector.StructType(true, + Arrays.asList(new HostColumnVector.BasicType(true, DType.STRING), + new HostColumnVector.BasicType(true, DType.STRING))); + try (ColumnVector listColumn = ColumnVector.fromLists(new 
HostColumnVector.ListType(true, + structType), list1, list2, list3); + Table t0 = new Table(listColumn)) { + try (TableWriter writer = Table.writeORCChunked(options, f)) { + writer.write(t0); + } + try (Table res = Table.readORC(f)) { + assertTablesAreEqual(t0, res); + } + } + } + @Test void testORCWriteToFile() throws IOException { File tempFile = File.createTempFile("test", ".orc"); @@ -7020,7 +7127,7 @@ void testORCWriteToFileWithColNames() throws IOException { final String[] colNames = new String[]{"bool", "int", "byte","long","str","float","double"}; try (Table table0 = getExpectedFileTable()) { ORCWriterOptions options = ORCWriterOptions.builder() - .withColumnNames(colNames) + .withColumns(true, colNames) .withMetadata("somekey", "somevalue") .build(); table0.writeORC(options, tempFile.getAbsoluteFile()); @@ -7040,7 +7147,7 @@ void testORCWriteToFileUncompressed() throws IOException { String[] colNames = new String[table0.getNumberOfColumns()]; Arrays.fill(colNames, ""); ORCWriterOptions opts = ORCWriterOptions.builder() - .withColumnNames(colNames) + .withColumns(true, colNames) .withCompressionType(CompressionType.NONE) .build(); table0.writeORC(opts, tempFileUncompressed.getAbsoluteFile()); @@ -7142,6 +7249,26 @@ void fixedWidthRowsRoundTrip() { // utility methods to reduce typing + private ORCWriterOptions createORCWriterOptionsWithNested() { + // The column metadata should match the table returned from + // 'getExpectedFileTable(true)'. + return ORCWriterOptions.builder() + .withNullableColumns("_c0", "_c1", "_c2", "_c3", "_c4", "_c5", "_c6") + .withStructColumn(structBuilder("_c7") + .withNullableColumns("_c7-1") + .withNullableColumns("_c7-2") + .build()) + .withListColumn(listBuilder("_c8") + .withNullableColumns("_c8-1").build()) + .withListColumn(listBuilder("_c9") + .withStructColumn(structBuilder("_c9-1") + .withNullableColumns("_c9-1-1") + .withNullableColumns("_c9-1-2") + .build()) + .build()) + .build(); + } + private StructData struct(Object... values) { return new StructData(values); } diff --git a/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java b/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java index a41cc22e9b2..c36d241500a 100644 --- a/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java +++ b/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,18 +29,20 @@ public class NvcompTest { private static final Logger log = LoggerFactory.getLogger(ColumnVector.class); @Test - void testLZ4RoundTripSync() { + void testLZ4RoundTripViaLZ4DecompressorSync() { lz4RoundTrip(false); } @Test - void testLZ4RoundTripAsync() { + void testLZ4RoundTripViaLZ4DecompressorAsync() { lz4RoundTrip(true); } @Test void testBatchedLZ4RoundTripAsync() { + final Cuda.Stream stream = Cuda.DEFAULT_STREAM; final long chunkSize = 64 * 1024; + final long targetIntermediteSize = Long.MAX_VALUE; final int maxElements = 1024 * 1024 + 1; final int numBuffers = 200; long[] data = new long[maxElements]; @@ -48,149 +50,52 @@ void testBatchedLZ4RoundTripAsync() { data[i] = i; } - DeviceMemoryBuffer[] originalBuffers = new DeviceMemoryBuffer[numBuffers]; - DeviceMemoryBuffer[] uncompressedBuffers = new DeviceMemoryBuffer[numBuffers]; - - // compressed data in buffers that are likely oversized - DeviceMemoryBuffer[] compressedBuffers = new DeviceMemoryBuffer[numBuffers]; - - // compressed data in right-sized buffers - DeviceMemoryBuffer[] compressedInputs = new DeviceMemoryBuffer[numBuffers]; - - try { + try (CloseableArray originalBuffers = + CloseableArray.wrap(new DeviceMemoryBuffer[numBuffers])) { // create the batched buffers to compress - for (int i = 0; i < numBuffers; ++i) { - originalBuffers[i] = initBatchBuffer(data, i); + for (int i = 0; i < originalBuffers.size(); i++) { + originalBuffers.set(i, initBatchBuffer(data, i)); + // Increment the refcount since compression will try to close it + originalBuffers.get(i).incRefCount(); } - // compress the buffers - long[] outputSizes; - long[] compressedSizes; - long tempSize = BatchedLZ4Compressor.getTempSize(originalBuffers, chunkSize); - try (DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempSize)) { - outputSizes = BatchedLZ4Compressor.getOutputSizes(originalBuffers, chunkSize, tempBuffer); - for (int i = 0; i < numBuffers; ++i) { - compressedBuffers[i] = DeviceMemoryBuffer.allocate(outputSizes[i]); - } - long sizesBufferSize = BatchedLZ4Compressor.getCompressedSizesBufferSize(numBuffers); - try (HostMemoryBuffer compressedSizesBuffer = HostMemoryBuffer.allocate(sizesBufferSize)) { - BatchedLZ4Compressor.compressAsync(compressedSizesBuffer, originalBuffers, chunkSize, - tempBuffer, compressedBuffers, Cuda.DEFAULT_STREAM); - Cuda.DEFAULT_STREAM.sync(); - compressedSizes = new long[numBuffers]; - for (int i = 0; i < numBuffers; ++i) { - compressedSizes[i] = compressedSizesBuffer.getLong(i * 8); - } - } - } - - // right-size the compressed buffers based on reported compressed sizes - for (int i = 0; i < numBuffers; ++i) { - compressedInputs[i] = compressedBuffers[i].slice(0, compressedSizes[i]); - } - - // decompress the buffers - try (BatchedLZ4Decompressor.BatchedMetadata metadata = - BatchedLZ4Decompressor.getMetadata(compressedInputs, Cuda.DEFAULT_STREAM)) { - outputSizes = BatchedLZ4Decompressor.getOutputSizes(metadata, numBuffers); - for (int i = 0; i < numBuffers; ++i) { - uncompressedBuffers[i] = DeviceMemoryBuffer.allocate(outputSizes[i]); - } - tempSize = BatchedLZ4Decompressor.getTempSize(metadata); - try (DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempSize)) { - BatchedLZ4Decompressor.decompressAsync(compressedInputs, tempBuffer, metadata, - uncompressedBuffers, Cuda.DEFAULT_STREAM); - } - } + // compress and decompress the buffers + BatchedLZ4Compressor compressor = new BatchedLZ4Compressor(chunkSize, targetIntermediteSize); - // check the decompressed results against the 
original - for (int i = 0; i < numBuffers; ++i) { - try (HostMemoryBuffer expected = HostMemoryBuffer.allocate(originalBuffers[i].getLength()); - HostMemoryBuffer actual = HostMemoryBuffer.allocate(outputSizes[i])) { - Assertions.assertTrue(expected.getLength() <= Integer.MAX_VALUE); - Assertions.assertTrue(actual.getLength() <= Integer.MAX_VALUE); - Assertions.assertEquals(originalBuffers[i].getLength(), uncompressedBuffers[i].getLength(), - "uncompressed size mismatch at buffer " + i); - expected.copyFromDeviceBuffer(originalBuffers[i]); - actual.copyFromDeviceBuffer(uncompressedBuffers[i]); - byte[] expectedBytes = new byte[(int) expected.getLength()]; - expected.getBytes(expectedBytes, 0, 0, expected.getLength()); - byte[] actualBytes = new byte[(int) actual.getLength()]; - actual.getBytes(actualBytes, 0, 0, actual.getLength()); - Assertions.assertArrayEquals(expectedBytes, actualBytes, - "mismatch in batch buffer " + i); + try (CloseableArray compressedBuffers = + CloseableArray.wrap(compressor.compress(originalBuffers.getArray(), stream)); + CloseableArray uncompressedBuffers = + CloseableArray.wrap(new DeviceMemoryBuffer[numBuffers])) { + for (int i = 0; i < numBuffers; i++) { + uncompressedBuffers.set(i, + DeviceMemoryBuffer.allocate(originalBuffers.get(i).getLength())); } - } - } finally { - closeBufferArray(originalBuffers); - closeBufferArray(uncompressedBuffers); - closeBufferArray(compressedBuffers); - closeBufferArray(compressedInputs); - } - } - - @Test - void testBatchedLZ4CompressRoundTrip() { - final long chunkSize = 64 * 1024; - final int maxElements = 1024 * 1024 + 1; - final int numBuffers = 200; - long[] data = new long[maxElements]; - for (int i = 0; i < maxElements; ++i) { - data[i] = i; - } - - DeviceMemoryBuffer[] originalBuffers = new DeviceMemoryBuffer[numBuffers]; - DeviceMemoryBuffer[] uncompressedBuffers = new DeviceMemoryBuffer[numBuffers]; - BatchedLZ4Compressor.BatchedCompressionResult compResult = null; - - // compressed data in right-sized buffers - DeviceMemoryBuffer[] compressedInputs = new DeviceMemoryBuffer[numBuffers]; - - try { - // create the batched buffers to compress - for (int i = 0; i < numBuffers; ++i) { - originalBuffers[i] = initBatchBuffer(data, i); - } - // compress the buffers - compResult = BatchedLZ4Compressor.compress(originalBuffers, chunkSize, Cuda.DEFAULT_STREAM); + // decompress takes ownership of the compressed buffers and will close them + BatchedLZ4Decompressor.decompressAsync(chunkSize, compressedBuffers.release(), + uncompressedBuffers.getArray(), stream); - // right-size the compressed buffers based on reported compressed sizes - DeviceMemoryBuffer[] compressedBuffers = compResult.getCompressedBuffers(); - long[] compressedSizes = compResult.getCompressedSizes(); - for (int i = 0; i < numBuffers; ++i) { - compressedInputs[i] = compressedBuffers[i].slice(0, compressedSizes[i]); - } - - // decompress the buffers - uncompressedBuffers = BatchedLZ4Decompressor.decompressAsync(compressedInputs, - Cuda.DEFAULT_STREAM); - - // check the decompressed results against the original - for (int i = 0; i < numBuffers; ++i) { - try (HostMemoryBuffer expected = HostMemoryBuffer.allocate(originalBuffers[i].getLength()); - HostMemoryBuffer actual = HostMemoryBuffer.allocate(uncompressedBuffers[i].getLength())) { - Assertions.assertTrue(expected.getLength() <= Integer.MAX_VALUE); - Assertions.assertTrue(actual.getLength() <= Integer.MAX_VALUE); - Assertions.assertEquals(originalBuffers[i].getLength(), uncompressedBuffers[i].getLength(), - 
"uncompressed size mismatch at buffer " + i); - expected.copyFromDeviceBuffer(originalBuffers[i]); - actual.copyFromDeviceBuffer(uncompressedBuffers[i]); - byte[] expectedBytes = new byte[(int) expected.getLength()]; - expected.getBytes(expectedBytes, 0, 0, expected.getLength()); - byte[] actualBytes = new byte[(int) actual.getLength()]; - actual.getBytes(actualBytes, 0, 0, actual.getLength()); - Assertions.assertArrayEquals(expectedBytes, actualBytes, - "mismatch in batch buffer " + i); + // check the decompressed results against the original + for (int i = 0; i < numBuffers; ++i) { + try (HostMemoryBuffer expected = + HostMemoryBuffer.allocate(originalBuffers.get(i).getLength()); + HostMemoryBuffer actual = + HostMemoryBuffer.allocate(uncompressedBuffers.get(i).getLength())) { + Assertions.assertTrue(expected.getLength() <= Integer.MAX_VALUE); + Assertions.assertTrue(actual.getLength() <= Integer.MAX_VALUE); + Assertions.assertEquals(expected.getLength(), actual.getLength(), + "uncompressed size mismatch at buffer " + i); + expected.copyFromDeviceBuffer(originalBuffers.get(i)); + actual.copyFromDeviceBuffer(uncompressedBuffers.get(i)); + byte[] expectedBytes = new byte[(int) expected.getLength()]; + expected.getBytes(expectedBytes, 0, 0, expected.getLength()); + byte[] actualBytes = new byte[(int) actual.getLength()]; + actual.getBytes(actualBytes, 0, 0, actual.getLength()); + Assertions.assertArrayEquals(expectedBytes, actualBytes, + "mismatch in batch buffer " + i); + } } } - } finally { - closeBufferArray(originalBuffers); - closeBufferArray(uncompressedBuffers); - closeBufferArray(compressedInputs); - if (compResult != null) { - closeBufferArray(compResult.getCompressedBuffers()); - } } } @@ -200,14 +105,6 @@ private void closeBuffer(MemoryBuffer buffer) { } } - private void closeBufferArray(MemoryBuffer[] buffers) { - for (MemoryBuffer buffer : buffers) { - if (buffer != null) { - buffer.close(); - } - } - } - private DeviceMemoryBuffer initBatchBuffer(long[] data, int bufferId) { // grab a subsection of the data based on buffer ID int dataStart = 0; @@ -239,6 +136,7 @@ private DeviceMemoryBuffer initBatchBuffer(long[] data, int bufferId) { } private void lz4RoundTrip(boolean useAsync) { + final Cuda.Stream stream = Cuda.DEFAULT_STREAM; final long chunkSize = 64 * 1024; final int numElements = 10 * 1024 * 1024 + 1; long[] data = new long[numElements]; @@ -251,31 +149,32 @@ private void lz4RoundTrip(boolean useAsync) { DeviceMemoryBuffer uncompressedBuffer = null; try (ColumnVector v = ColumnVector.fromLongs(data)) { BaseDeviceMemoryBuffer inputBuffer = v.getDeviceBufferFor(BufferType.DATA); - log.debug("Uncompressed size is {}", inputBuffer.getLength()); - - long tempSize = LZ4Compressor.getTempSize(inputBuffer, CompressionType.CHAR, chunkSize); - - log.debug("Using {} temporary space for lz4 compression", tempSize); - tempBuffer = DeviceMemoryBuffer.allocate(tempSize); + final long uncompressedSize = inputBuffer.getLength(); + log.debug("Uncompressed size is {}", uncompressedSize); - long outSize = LZ4Compressor.getOutputSize(inputBuffer, CompressionType.CHAR, chunkSize, - tempBuffer); - log.debug("lz4 compressed size estimate is {}", outSize); + LZ4Compressor.Configuration compressConf = + LZ4Compressor.configure(chunkSize, uncompressedSize); + Assertions.assertTrue(compressConf.getMetadataBytes() > 0); + log.debug("Using {} temporary space for lz4 compression", compressConf.getTempBytes()); + tempBuffer = DeviceMemoryBuffer.allocate(compressConf.getTempBytes()); + log.debug("lz4 
compressed size estimate is {}", compressConf.getMaxCompressedBytes()); - compressedBuffer = DeviceMemoryBuffer.allocate(outSize); + compressedBuffer = DeviceMemoryBuffer.allocate(compressConf.getMaxCompressedBytes()); long startTime = System.nanoTime(); long compressedSize; if (useAsync) { - try (HostMemoryBuffer tempHostBuffer = HostMemoryBuffer.allocate(8)) { - LZ4Compressor.compressAsync(tempHostBuffer, inputBuffer, CompressionType.CHAR, chunkSize, - tempBuffer, compressedBuffer, Cuda.DEFAULT_STREAM); - Cuda.DEFAULT_STREAM.sync(); - compressedSize = tempHostBuffer.getLong(0); + try (DeviceMemoryBuffer devCompressedSizeBuffer = DeviceMemoryBuffer.allocate(8); + HostMemoryBuffer hostCompressedSizeBuffer = HostMemoryBuffer.allocate(8)) { + LZ4Compressor.compressAsync(devCompressedSizeBuffer, inputBuffer, CompressionType.CHAR, + chunkSize, tempBuffer, compressedBuffer, stream); + hostCompressedSizeBuffer.copyFromDeviceBufferAsync(devCompressedSizeBuffer, stream); + stream.sync(); + compressedSize = hostCompressedSizeBuffer.getLong(0); } } else { compressedSize = LZ4Compressor.compress(inputBuffer, CompressionType.CHAR, chunkSize, - tempBuffer, compressedBuffer, Cuda.DEFAULT_STREAM); + tempBuffer, compressedBuffer, stream); } double duration = (System.nanoTime() - startTime) / 1000.0; log.info("Compressed with lz4 to {} in {} us", compressedSize, duration); @@ -283,23 +182,20 @@ private void lz4RoundTrip(boolean useAsync) { tempBuffer.close(); tempBuffer = null; - Assertions.assertTrue(Decompressor.isLZ4Data(compressedBuffer)); - - try (Decompressor.Metadata metadata = - Decompressor.getMetadata(compressedBuffer, Cuda.DEFAULT_STREAM)) { - Assertions.assertTrue(metadata.isLZ4Metadata()); - tempSize = Decompressor.getTempSize(metadata); + try (LZ4Decompressor.Configuration decompressConf = + LZ4Decompressor.configure(compressedBuffer, stream)) { + final long tempSize = decompressConf.getTempBytes(); log.debug("Using {} temporary space for lz4 compression", tempSize); tempBuffer = DeviceMemoryBuffer.allocate(tempSize); - outSize = Decompressor.getOutputSize(metadata); + final long outSize = decompressConf.getUncompressedBytes(); Assertions.assertEquals(inputBuffer.getLength(), outSize); uncompressedBuffer = DeviceMemoryBuffer.allocate(outSize); - Decompressor.decompressAsync(compressedBuffer, tempBuffer, metadata, uncompressedBuffer, - Cuda.DEFAULT_STREAM); + LZ4Decompressor.decompressAsync(compressedBuffer, decompressConf, tempBuffer, + uncompressedBuffer, stream); try (ColumnVector v2 = new ColumnVector( DType.INT64, @@ -324,133 +220,4 @@ private void lz4RoundTrip(boolean useAsync) { closeBuffer(uncompressedBuffer); } } - - @Test - void testCascadedRoundTripSync() { - cascadedRoundTrip(false); - } - - @Test - void testCascadedRoundTripAsync() { - cascadedRoundTrip(true); - } - - private void cascadedRoundTrip(boolean useAsync) { - final int numElements = 10 * 1024 * 1024 + 1; - final int numRunLengthEncodings = 2; - final int numDeltas = 1; - final boolean useBitPacking = true; - int[] data = new int[numElements]; - for (int i = 0; i < numElements; ++i) { - data[i] = i; - } - - DeviceMemoryBuffer tempBuffer = null; - DeviceMemoryBuffer compressedBuffer = null; - DeviceMemoryBuffer uncompressedBuffer = null; - try (ColumnVector v = ColumnVector.fromInts(data)) { - BaseDeviceMemoryBuffer inputBuffer = v.getDeviceBufferFor(BufferType.DATA); - log.debug("Uncompressed size is " + inputBuffer.getLength()); - - long tempSize = NvcompJni.cascadedCompressGetTempSize( - inputBuffer.getAddress(), - 
inputBuffer.getLength(), - CompressionType.INT.nativeId, - numRunLengthEncodings, - numDeltas, - useBitPacking); - - log.debug("Using {} temporary space for cascaded compression", tempSize); - tempBuffer = DeviceMemoryBuffer.allocate(tempSize); - - long outSize = NvcompJni.cascadedCompressGetOutputSize( - inputBuffer.getAddress(), - inputBuffer.getLength(), - CompressionType.INT.nativeId, - numRunLengthEncodings, - numDeltas, - useBitPacking, - tempBuffer.getAddress(), - tempBuffer.getLength(), - false); - log.debug("Inexact cascaded compressed size estimate is {}", outSize); - - compressedBuffer = DeviceMemoryBuffer.allocate(outSize); - - long startTime = System.nanoTime(); - long compressedSize; - if (useAsync) { - try (HostMemoryBuffer tempHostBuffer = HostMemoryBuffer.allocate(8)) { - NvcompJni.cascadedCompressAsync( - tempHostBuffer.getAddress(), - inputBuffer.getAddress(), - inputBuffer.getLength(), - CompressionType.INT.nativeId, - numRunLengthEncodings, - numDeltas, - useBitPacking, - tempBuffer.getAddress(), - tempBuffer.getLength(), - compressedBuffer.getAddress(), - compressedBuffer.getLength(), - 0); - Cuda.DEFAULT_STREAM.sync(); - compressedSize = tempHostBuffer.getLong(0); - } - } else { - compressedSize = NvcompJni.cascadedCompress( - inputBuffer.getAddress(), - inputBuffer.getLength(), - CompressionType.INT.nativeId, - numRunLengthEncodings, - numDeltas, - useBitPacking, - tempBuffer.getAddress(), - tempBuffer.getLength(), - compressedBuffer.getAddress(), - compressedBuffer.getLength(), - 0); - } - - double duration = (System.nanoTime() - startTime) / 1000.0; - log.debug("Compressed with cascaded to {} in {} us", compressedSize, duration); - - tempBuffer.close(); - tempBuffer = null; - - try (Decompressor.Metadata metadata = - Decompressor.getMetadata(compressedBuffer, Cuda.DEFAULT_STREAM)) { - tempSize = Decompressor.getTempSize(metadata); - - log.debug("Using {} temporary space for cascaded compression", tempSize); - tempBuffer = DeviceMemoryBuffer.allocate(tempSize); - - outSize = Decompressor.getOutputSize(metadata); - Assertions.assertEquals(inputBuffer.getLength(), outSize); - - uncompressedBuffer = DeviceMemoryBuffer.allocate(outSize); - - Decompressor.decompressAsync(compressedBuffer, tempBuffer, metadata, uncompressedBuffer, - Cuda.DEFAULT_STREAM); - - try (ColumnVector v2 = new ColumnVector( - DType.INT32, - numElements, - Optional.empty(), - uncompressedBuffer, - null, - null)) { - uncompressedBuffer = null; - try (ColumnVector compare = v2.equalTo(v); - Scalar compareAll = compare.all()) { - Assertions.assertTrue(compareAll.getBoolean()); - } - } - } - } finally { - closeBuffer(tempBuffer); - closeBuffer(compressedBuffer); - closeBuffer(uncompressedBuffer); - } - } } diff --git a/python/.flake8 b/python/.flake8 index 721558764af..c645c46a216 100644 --- a/python/.flake8 +++ b/python/.flake8 @@ -4,14 +4,19 @@ exclude = __init__.py ignore = # line break before binary operator - W503 + W503, # whitespace before : E203 [pydocstyle] -match = ^(.*abc\.py|types\.py)$ -#match = ^(types\.py)$ +match = ^(.*abc\.py|.*api/types\.py|.*single_column_frame\.py|.*indexed_frame\.py)$ # Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather than include using match-dir. match-dir = ^(?!ci|cpp|python/dask_cudf|python/cudf_kafka|python/custreamz).*$ -# In addition to numpy style, we additionally ignore magic methods (D105) and newlines before docstrings (D204). 
-add-ignore = D105, D204 +# In addition to numpy style, we additionally ignore: +add-ignore = + # magic methods + D105, + # no docstring in __init__ + D107, + # newlines before docstrings + D204 diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index df09a72ce25..bc35551b5bd 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -98,6 +98,7 @@ read_parquet, read_text, ) +from cudf.core.tools.datetimes import date_range from cudf.utils.dtypes import _NA_REP from cudf.utils.utils import set_allocator diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 0e68f1c71cc..ff5870c50be 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -116,6 +116,8 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): ) elif dtype == "decimal64": meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION + elif dtype == "decimal32": + meta["max_precision"] = cudf.Decimal32Dtype.MAX_PRECISION meta["dtype"] = dtype meta["null_frequency"] = null_frequency diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index bafa1c914fd..dafaa8f4d1d 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Optional, Tuple, TypeVar, Union +from typing import Dict, Optional, Tuple, TypeVar from cudf._typing import Dtype, DtypeObj, ScalarLike from cudf.core.buffer import Buffer @@ -21,6 +21,7 @@ class Column: _null_count: int _children: Tuple[ColumnBase, ...] _base_children: Tuple[ColumnBase, ...] + _distinct_count: Dict[bool, int] def __init__( self, diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 3103dbe81b7..ff3f3050e63 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -80,6 +80,7 @@ cdef class Column: self._size = size self._cached_sizeof = None + self._distinct_count = {} self._dtype = dtype self._offset = offset self._null_count = null_count @@ -203,9 +204,14 @@ cdef class Column: raise ValueError(error_msg) self._mask = None - self._null_count = None self._children = None self._base_mask = value + self._clear_cache() + + def _clear_cache(self): + self._distinct_count = {} + self._cached_sizeof = None + self._null_count = None def set_mask(self, value): """ diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 85faa08621b..c892c100bf6 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -247,47 +247,6 @@ def scatter(object source, Column scatter_map, Column target_column, return next(iter(data.values())) -def _reverse_column(Column source_column): - cdef column_view reverse_column_view = source_column.view() - - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_copying.reverse( - reverse_column_view - )) - - return Column.from_unique_ptr( - move(c_result) - ) - - -def _reverse_table(source_table): - cdef table_view reverse_table_view = table_view_from_columns(source_table) - - cdef unique_ptr[table] c_result - with nogil: - c_result = move(cpp_copying.reverse( - reverse_table_view - )) - - return data_from_unique_ptr( - move(c_result), - column_names=source_table._column_names, - index_names=source_table._index_names - ) - - -def reverse(object source): - """ - Reversing a column or a table - """ - if isinstance(source, Column): - return _reverse_column(source) - else: - 
return _reverse_table(source) - - def column_empty_like(Column input_column): cdef column_view input_column_view = input_column.view() diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index a318dc68ac9..be1b6d8069c 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -34,14 +34,6 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: out_of_bounds_policy policy ) except + - cdef unique_ptr[table] reverse ( - const table_view& source_table - ) except + - - cdef unique_ptr[column] reverse ( - const column_view& source_column - ) except + - cdef unique_ptr[column] shift( const column_view& input, size_type offset, diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/cpp/datetime.pxd index c27eb324008..2af4dd648c5 100644 --- a/python/cudf/cudf/_lib/cpp/datetime.pxd +++ b/python/cudf/cudf/_lib/cpp/datetime.pxd @@ -2,6 +2,7 @@ from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport scalar cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: diff --git a/python/cudf/cudf/_lib/cpp/filling.pxd b/python/cudf/cudf/_lib/cpp/filling.pxd index 42bdd827452..4233ab60ff2 100644 --- a/python/cudf/cudf/_lib/cpp/filling.pxd +++ b/python/cudf/cudf/_lib/cpp/filling.pxd @@ -42,3 +42,9 @@ cdef extern from "cudf/filling.hpp" namespace "cudf" nogil: const scalar & init, const scalar & step ) except + + + cdef unique_ptr[column] calendrical_month_sequence( + size_type n, + const scalar& init, + size_type months, + ) except + diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index 3036b000c5b..c855f112692 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -69,12 +69,18 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.sink_info get_sink() except+ cudf_io_types.compression_type get_compression() except+ bool enable_statistics() except+ + size_t stripe_size_bytes() except+ + size_type stripe_size_rows() except+ + size_type row_index_stride() except+ cudf_table_view.table_view get_table() except+ const cudf_io_types.table_input_metadata *get_metadata() except+ # setter void set_compression(cudf_io_types.compression_type comp) except+ void enable_statistics(bool val) except+ + void set_stripe_size_bytes(size_t val) except+ + void set_stripe_size_rows(size_type val) except+ + void set_row_index_stride(size_type val) except+ void set_table(cudf_table_view.table_view tbl) except+ void set_metadata(cudf_io_types.table_input_metadata* meta) except+ @@ -90,6 +96,9 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.compression_type comp ) except+ orc_writer_options_builder& enable_statistics(bool val) except+ + orc_writer_options_builder& stripe_size_bytes(size_t val) except+ + orc_writer_options_builder& stripe_size_rows(size_type val) except+ + orc_writer_options_builder& row_index_stride(size_type val) except+ orc_writer_options_builder& table( cudf_table_view.table_view tbl ) except+ @@ -106,6 +115,9 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.sink_info get_sink() except+ cudf_io_types.compression_type get_compression() except+ bool enable_statistics() except+ + size_t stripe_size_bytes() except+ + size_type stripe_size_rows() except+ + size_type row_index_stride() except+ cudf_table_view.table_view get_table() except+ const cudf_io_types.table_input_metadata 
*get_metadata( ) except+ @@ -113,6 +125,9 @@ cdef extern from "cudf/io/orc.hpp" \ # setter void set_compression(cudf_io_types.compression_type comp) except+ void enable_statistics(bool val) except+ + void set_stripe_size_bytes(size_t val) except+ + void set_stripe_size_rows(size_type val) except+ + void set_row_index_stride(size_type val) except+ void set_table(cudf_table_view.table_view tbl) except+ void set_metadata( cudf_io_types.table_input_metadata* meta @@ -129,6 +144,9 @@ cdef extern from "cudf/io/orc.hpp" \ cudf_io_types.compression_type comp ) except+ chunked_orc_writer_options_builder& enable_statistics(bool val) except+ + orc_writer_options_builder& stripe_size_bytes(size_t val) except+ + orc_writer_options_builder& stripe_size_rows(size_type val) except+ + orc_writer_options_builder& row_index_stride(size_type val) except+ chunked_orc_writer_options_builder& table( cudf_table_view.table_view tbl ) except+ diff --git a/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd b/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd index 02a4469f495..d193a8265b1 100644 --- a/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from cudf._lib.cpp.column.column cimport column @@ -11,3 +11,6 @@ cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] title( const column_view & strings) except + + + cdef unique_ptr[column] is_title( + const column_view & strings) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/contains.pxd b/python/cudf/cudf/_lib/cpp/strings/contains.pxd index bde0b4fdfb7..8014a60617d 100644 --- a/python/cudf/cudf/_lib/cpp/strings/contains.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/contains.pxd @@ -7,16 +7,27 @@ from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +cdef extern from "cudf/strings/regex/flags.hpp" \ + namespace "cudf::strings" nogil: + + ctypedef enum regex_flags: + DEFAULT 'cudf::strings::regex_flags::DEFAULT' + MULTILINE 'cudf::strings::regex_flags::MULTILINE' + DOTALL 'cudf::strings::regex_flags::DOTALL' + cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] contains_re( column_view source_strings, - string pattern) except + + string pattern, + regex_flags flags) except + cdef unique_ptr[column] count_re( column_view source_strings, - string pattern) except + + string pattern, + regex_flags flags) except + cdef unique_ptr[column] matches_re( column_view source_strings, - string pattern) except + + string pattern, + regex_flags flags) except + diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_lists.pxd b/python/cudf/cudf/_lib/cpp/strings/convert/convert_lists.pxd new file mode 100644 index 00000000000..99bb80a813d --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/strings/convert/convert_lists.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
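+# Declares cudf::strings::format_list_column, which renders each row of a LIST
+# column as a single string using the given separator strings, substituting
+# na_rep for null elements.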
+from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport string_scalar + + +cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \ + "cudf::strings" nogil: + + cdef unique_ptr[column] format_list_column( + column_view input_col, + string_scalar na_rep, + column_view separators) except + diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd index 907a85ed593..3153427ce3c 100644 --- a/python/cudf/cudf/_lib/cpp/transform.pxd +++ b/python/cudf/cudf/_lib/cpp/transform.pxd @@ -43,3 +43,8 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: cdef pair[unique_ptr[table], unique_ptr[column]] encode( table_view input ) except + + + cdef pair[unique_ptr[column], table_view] one_hot_encode( + column_view input_column, + column_view categories + ) diff --git a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd index 74efdb08bea..628ffef433b 100644 --- a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd +++ b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd @@ -5,6 +5,8 @@ from libc.stdint cimport int32_t, int64_t cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: # cython type stub to help resolve to numeric::decimal64 ctypedef int64_t decimal64 + # cython type stub to help resolve to numeric::decimal32 + ctypedef int64_t decimal32 cdef cppclass scale_type: scale_type(int32_t) diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 4921d1b4ace..5cda06362b6 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -5,6 +5,9 @@ cimport cudf._lib.cpp.datetime as libcudf_datetime from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.filling cimport calendrical_month_sequence +from cudf._lib.cpp.types cimport size_type +from cudf._lib.scalar cimport DeviceScalar def add_months(Column col, Column months): @@ -98,6 +101,22 @@ def is_leap_year(Column col): return Column.from_unique_ptr(move(c_result)) +def date_range(DeviceScalar start, size_type n, offset): + cdef unique_ptr[column] c_result + cdef size_type months = ( + offset.kwds.get("years", 0) * 12 + + offset.kwds.get("months", 0) + ) + + with nogil: + c_result = move(calendrical_month_sequence( + n, + start.c_value.get()[0], + months + )) + return Column.from_unique_ptr(move(c_result)) + + def extract_quarter(Column col): """ Returns a column which contains the corresponding quarter of the year diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index ae0116e635b..9b34a049cac 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -54,17 +54,23 @@ def hash_partition(source_table, object columns_to_hash, ) -def hash(source_table, object initial_hash_values=None, int seed=0): - cdef vector[uint32_t] c_initial_hash = initial_hash_values or [] +def hash(source_table, str method, object initial_hash=None, int seed=0): + cdef vector[uint32_t] c_initial_hash = initial_hash or [] cdef table_view c_source_view = table_view_from_table( source_table, ignore_index=True) - cdef unique_ptr[column] c_result + cdef libcudf_types.hash_id c_hash_function + if method == "murmur3": + c_hash_function = libcudf_types.hash_id.HASH_MURMUR3 + elif method == "md5": + c_hash_function = 
libcudf_types.hash_id.HASH_MD5 + else: + raise ValueError(f"Unsupported hash function: {method}") with nogil: c_result = move( cpp_hash( c_source_view, - libcudf_types.hash_id.HASH_MURMUR3, + c_hash_function, c_initial_hash, seed ) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 83c7f9c8635..1281aa172b4 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -34,6 +34,7 @@ from cudf._lib.cpp.io.types cimport ( ) from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type, type_id +from cudf._lib.io.datasource cimport NativeFileDatasource from cudf._lib.io.utils cimport ( make_sink_info, make_source_info, @@ -53,6 +54,8 @@ from cudf._lib.utils cimport ( table_view_from_table, ) +from pyarrow.lib import NativeFile + from cudf._lib.utils import _index_level_name, generate_pandas_metadata from cudf.api.types import is_list_dtype, is_struct_dtype @@ -66,6 +69,10 @@ cpdef read_raw_orc_statistics(filepath_or_buffer): cudf.io.orc.read_orc_statistics """ + # Handle NativeFile input + if isinstance(filepath_or_buffer, NativeFile): + filepath_or_buffer = NativeFileDatasource(filepath_or_buffer) + cdef raw_orc_statistics raw = ( libcudf_read_raw_orc_statistics(make_source_info([filepath_or_buffer])) ) @@ -139,7 +146,10 @@ cdef compression_type _get_comp_type(object compression): cpdef write_orc(table, object path_or_buf, object compression=None, - bool enable_statistics=True): + bool enable_statistics=True, + object stripe_size_bytes=None, + object stripe_size_rows=None, + object row_index_stride=None): """ Cython function to call into libcudf API, see `write_orc`. @@ -181,6 +191,12 @@ cpdef write_orc(table, .enable_statistics( (True if enable_statistics else False)) .build() ) + if stripe_size_bytes is not None: + c_orc_writer_options.set_stripe_size_bytes(stripe_size_bytes) + if stripe_size_rows is not None: + c_orc_writer_options.set_stripe_size_rows(stripe_size_rows) + if row_index_stride is not None: + c_orc_writer_options.set_row_index_stride(row_index_stride) with nogil: libcudf_write_orc(c_orc_writer_options) @@ -209,6 +225,9 @@ cdef orc_reader_options make_orc_reader_options( object decimal_cols_as_float ) except*: + for i, datasource in enumerate(filepaths_or_buffers): + if isinstance(datasource, NativeFile): + filepaths_or_buffers[i] = NativeFileDatasource(datasource) cdef vector[string] c_column_names cdef vector[vector[size_type]] strps = stripes c_column_names.reserve(len(column_names)) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 43bc1ac9db7..9c24e5becfd 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -118,9 +118,17 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, cudf.io.parquet.read_parquet cudf.io.parquet.to_parquet """ + + # Convert NativeFile buffers to NativeFileDatasource, + # but save original buffers in case we need to use + # pyarrow for metadata processing + # (See: https://github.com/rapidsai/cudf/issues/9599) + pa_buffers = [] for i, datasource in enumerate(filepaths_or_buffers): if isinstance(datasource, NativeFile): + pa_buffers.append(datasource) filepaths_or_buffers[i] = NativeFileDatasource(datasource) + cdef cudf_io_types.source_info source = make_source_info( filepaths_or_buffers) @@ -192,7 +200,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, # update the decimal precision of each column if meta is not None: for 
col, col_meta in zip(column_names, meta["columns"]): - if isinstance(df._data[col].dtype, cudf.Decimal64Dtype): + if is_decimal_dtype(df._data[col].dtype): df._data[col].dtype.precision = ( col_meta["metadata"]["precision"] ) @@ -203,7 +211,9 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, range_index_meta = index_col[0] if row_groups is not None: per_file_metadata = [ - pa.parquet.read_metadata(s) for s in filepaths_or_buffers + pa.parquet.read_metadata(s) for s in ( + pa_buffers or filepaths_or_buffers + ) ] filtered_idx = [] diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index ab53a242db2..21a039dbf78 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -2,7 +2,6 @@ import cudf from cudf.api.types import is_decimal_dtype -from cudf.core.dtypes import Decimal64Dtype from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column @@ -18,7 +17,11 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move, pair from cudf._lib.aggregation cimport Aggregation, make_aggregation -from cudf._lib.types cimport dtype_to_data_type, underlying_type_t_type_id +from cudf._lib.types cimport ( + dtype_to_data_type, + is_decimal_type_id, + underlying_type_t_type_id, +) import numpy as np @@ -72,11 +75,11 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): c_out_dtype )) - if c_result.get()[0].type().id() == libcudf_types.type_id.DECIMAL64: + if is_decimal_type_id(c_result.get()[0].type().id()): scale = -c_result.get()[0].type().scale() precision = _reduce_precision(col_dtype, reduction_op, len(incol)) py_result = DeviceScalar.from_unique_ptr( - move(c_result), dtype=cudf.Decimal64Dtype(precision, scale) + move(c_result), dtype=col_dtype.__class__(precision, scale) ) else: py_result = DeviceScalar.from_unique_ptr(move(c_result)) @@ -157,4 +160,4 @@ def _reduce_precision(dtype, op, nrows): new_p = 2 * p + nrows else: raise NotImplementedError() - return max(min(new_p, cudf.Decimal64Dtype.MAX_PRECISION), 0) + return max(min(new_p, dtype.MAX_PRECISION), 0) diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx index 2d7f56dc5ce..e4311b356ec 100644 --- a/python/cudf/cudf/_lib/replace.pyx +++ b/python/cudf/cudf/_lib/replace.pyx @@ -204,12 +204,8 @@ def clip(Column input_col, object lo, object hi): and > hi will be replaced by hi """ - lo_scalar = as_device_scalar( - lo, dtype=input_col.dtype if lo is None else None - ) - hi_scalar = as_device_scalar( - hi, dtype=input_col.dtype if hi is None else None - ) + lo_scalar = as_device_scalar(lo, dtype=input_col.dtype) + hi_scalar = as_device_scalar(hi, dtype=input_col.dtype) return clamp(input_col, lo_scalar, hi_scalar) diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx index 1c683073277..d64d0543892 100644 --- a/python/cudf/cudf/_lib/reshape.pyx +++ b/python/cudf/cudf/_lib/reshape.pyx @@ -5,6 +5,7 @@ from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.reshape cimport ( interleave_columns as cpp_interleave_columns, tile as cpp_tile, diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 4017c60683e..43c0198f80a 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -45,7 +45,7 @@ from cudf._lib.cpp.scalar.scalar cimport ( struct_scalar, timestamp_scalar, ) 
-from cudf._lib.cpp.wrappers.decimals cimport decimal64, scale_type +from cudf._lib.cpp.wrappers.decimals cimport decimal32, decimal64, scale_type from cudf._lib.cpp.wrappers.durations cimport ( duration_ms, duration_ns, @@ -88,8 +88,8 @@ cdef class DeviceScalar: # IMPORTANT: this should only ever be called from __init__ valid = not _is_null_host_scalar(value) - if isinstance(dtype, cudf.Decimal64Dtype): - _set_decimal64_from_scalar( + if isinstance(dtype, (cudf.Decimal64Dtype, cudf.Decimal32Dtype)): + _set_decimal_from_scalar( self.c_value, value, dtype, valid) elif isinstance(dtype, cudf.ListDtype): _set_list_from_pylist( @@ -118,7 +118,7 @@ cdef class DeviceScalar: ) def _to_host_scalar(self): - if isinstance(self.dtype, cudf.Decimal64Dtype): + if isinstance(self.dtype, (cudf.Decimal64Dtype, cudf.Decimal32Dtype)): result = _get_py_decimal_from_fixed_point(self.c_value) elif cudf.api.types.is_struct_dtype(self.dtype): result = _get_py_dict_from_struct(self.c_value) @@ -305,16 +305,25 @@ cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, else: raise ValueError(f"dtype not supported: {dtype}") -cdef _set_decimal64_from_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): +cdef _set_decimal_from_scalar(unique_ptr[scalar]& s, + object value, + object dtype, + bool valid=True): value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0 - s.reset( - new fixed_point_scalar[decimal64]( - np.int64(value), scale_type(-dtype.scale), valid + if isinstance(dtype, cudf.Decimal64Dtype): + s.reset( + new fixed_point_scalar[decimal64]( + np.int64(value), scale_type(-dtype.scale), valid + ) ) - ) + elif isinstance(dtype, cudf.Decimal32Dtype): + s.reset( + new fixed_point_scalar[decimal32]( + np.int32(value), scale_type(-dtype.scale), valid + ) + ) + else: + raise ValueError(f"dtype not supported: {dtype}") cdef _set_struct_from_pydict(unique_ptr[scalar]& s, object value, @@ -450,6 +459,10 @@ cdef _get_py_decimal_from_fixed_point(unique_ptr[scalar]& s): rep_val = int((s_ptr)[0].value()) scale = int((s_ptr)[0].type().scale()) return decimal.Decimal(rep_val).scaleb(scale) + elif cdtype.id() == libcudf_types.DECIMAL32: + rep_val = int((s_ptr)[0].value()) + scale = int((s_ptr)[0].type().scale()) + return decimal.Decimal(rep_val).scaleb(scale) else: raise ValueError("Could not convert cudf::scalar to numpy scalar") diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index 3d31c7443d4..3aa0b35e90e 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -113,7 +113,7 @@ def is_sorted( return c_result -def order_by(source_table, object ascending, bool na_position): +def order_by(source_table, object ascending, str na_position): """ Sorting the table ascending/descending @@ -123,27 +123,16 @@ def order_by(source_table, object ascending, bool na_position): ascending : list of boolean values which correspond to each column in source_table signifying order of each column True - Ascending and False - Descending - na_position : whether null should be considered larget or smallest value - 0 - largest and 1 - smallest - + na_position : whether null value should show up at the "first" or "last" + position of **all** sorted column. 
""" - cdef table_view source_table_view = table_view_from_table( source_table, ignore_index=True ) cdef vector[order] column_order column_order.reserve(len(ascending)) - cdef null_order pred = ( - null_order.BEFORE - if na_position == 1 - else null_order.AFTER - ) - cdef vector[null_order] null_precedence = ( - vector[null_order]( - source_table._num_columns, - pred - ) - ) + cdef vector[null_order] null_precedence + null_precedence.reserve(len(ascending)) for i in ascending: if i is True: @@ -151,6 +140,11 @@ def order_by(source_table, object ascending, bool na_position): else: column_order.push_back(order.DESCENDING) + if i ^ (na_position == "first"): + null_precedence.push_back(null_order.AFTER) + else: + null_precedence.push_back(null_order.BEFORE) + cdef unique_ptr[column] c_result with nogil: c_result = move(sorted_order(source_table_view, diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 598ac804dd6..fbc1538cc74 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -26,7 +26,7 @@ count_bytes, count_characters, ) -from cudf._lib.strings.capitalize import capitalize, title +from cudf._lib.strings.capitalize import capitalize, title, is_title from cudf._lib.strings.case import swapcase, to_lower, to_upper from cudf._lib.strings.char_types import ( filter_alphanum, diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx index bb1bf25ef7b..0bbdfa462e2 100644 --- a/python/cudf/cudf/_lib/strings/capitalize.pyx +++ b/python/cudf/cudf/_lib/strings/capitalize.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -8,6 +8,7 @@ from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.strings.capitalize cimport ( capitalize as cpp_capitalize, + is_title as cpp_is_title, title as cpp_title, ) @@ -30,3 +31,13 @@ def title(Column source_strings): c_result = move(cpp_title(source_view)) return Column.from_unique_ptr(move(c_result)) + + +def is_title(Column source_strings): + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + + with nogil: + c_result = move(cpp_is_title(source_view)) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx index 1f622378280..f18d0eb7f36 100644 --- a/python/cudf/cudf/_lib/strings/contains.pyx +++ b/python/cudf/cudf/_lib/strings/contains.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2020, NVIDIA CORPORATION. +from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -11,11 +12,12 @@ from cudf._lib.cpp.strings.contains cimport ( contains_re as cpp_contains_re, count_re as cpp_count_re, matches_re as cpp_matches_re, + regex_flags as regex_flags, ) from cudf._lib.scalar cimport DeviceScalar -def contains_re(Column source_strings, object reg_ex): +def contains_re(Column source_strings, object reg_ex, uint32_t flags): """ Returns a Column of boolean values with True for `source_strings` that contain regular expression `reg_ex`. 
@@ -24,17 +26,19 @@ def contains_re(Column source_strings, object reg_ex): cdef column_view source_view = source_strings.view() cdef string reg_ex_string = str(reg_ex).encode() + cdef regex_flags c_flags = flags with nogil: c_result = move(cpp_contains_re( source_view, - reg_ex_string + reg_ex_string, + c_flags )) return Column.from_unique_ptr(move(c_result)) -def count_re(Column source_strings, object reg_ex): +def count_re(Column source_strings, object reg_ex, uint32_t flags): """ Returns a Column with count of occurrences of `reg_ex` in each string of `source_strings` @@ -43,17 +47,19 @@ def count_re(Column source_strings, object reg_ex): cdef column_view source_view = source_strings.view() cdef string reg_ex_string = str(reg_ex).encode() + cdef regex_flags c_flags = flags with nogil: c_result = move(cpp_count_re( source_view, - reg_ex_string + reg_ex_string, + c_flags )) return Column.from_unique_ptr(move(c_result)) -def match_re(Column source_strings, object reg_ex): +def match_re(Column source_strings, object reg_ex, uint32_t flags): """ Returns a Column with each value True if the string matches `reg_ex` regular expression with each record of `source_strings` @@ -62,11 +68,13 @@ def match_re(Column source_strings, object reg_ex): cdef column_view source_view = source_strings.view() cdef string reg_ex_string = str(reg_ex).encode() + cdef regex_flags c_flags = flags with nogil: c_result = move(cpp_matches_re( source_view, - reg_ex_string + reg_ex_string, + c_flags )) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx new file mode 100644 index 00000000000..7ffa69cd680 --- /dev/null +++ b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx @@ -0,0 +1,48 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.cpp.strings.convert.convert_lists cimport ( + format_list_column as cpp_format_list_column, +) + +from cudf._lib.scalar import as_device_scalar + +from cudf._lib.scalar cimport DeviceScalar + + +def format_list_column(Column source_list, Column separators): + """ + Format a list column of strings into a strings column. + + Parameters + ---------- + input_col : input column of type list with strings child. 
+ + separators: strings used for formatting (', ', '[', ']') + + Returns + ------- + Formatted strings column + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_list.view() + cdef column_view separators_view = separators.view() + # Use 'None' as null-replacment string + cdef DeviceScalar str_na_rep = as_device_scalar("None") + cdef const string_scalar* string_scalar_na_rep = ( + str_na_rep.get_raw_ptr()) + + with nogil: + c_result = move(cpp_format_list_column( + source_view, string_scalar_na_rep[0], separators_view + )) + + return Column.from_unique_ptr( + move(c_result) + ) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 215a4d5e2ae..a0eb7c68183 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -24,7 +24,11 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type, type_id from cudf._lib.types cimport underlying_type_t_type_id -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +from cudf._lib.utils cimport ( + data_from_table_view, + data_from_unique_ptr, + table_view_from_table, +) def bools_to_mask(Column col): @@ -158,3 +162,27 @@ def table_encode(input): ), Column.from_unique_ptr(move(c_result.second)) ) + + +def one_hot_encode(Column input_column, Column categories): + cdef column_view c_view_input = input_column.view() + cdef column_view c_view_categories = categories.view() + cdef pair[unique_ptr[column], table_view] c_result + + with nogil: + c_result = move( + libcudf_transform.one_hot_encode(c_view_input, c_view_categories) + ) + + owner = Column.from_unique_ptr(move(c_result.first)) + + pylist_categories = categories.to_arrow().to_pylist() + encodings, _ = data_from_table_view( + move(c_result.second), + owner=owner, + column_names=[ + x if x is not None else 'null' for x in pylist_categories + ] + ) + + return encodings diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index bea1164b655..b33a3cefba7 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -61,7 +61,7 @@ def transpose(source): if cats is not None: data= [ (name, cudf.core.column.column.build_categorical_column( - codes=cudf.core.column.column.as_column( + codes=cudf.core.column.column.build_column( col.base_data, dtype=col.dtype), mask=col.base_mask, size=col.size, diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index dbbe9b1e05a..58e3221a4ec 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -17,3 +17,4 @@ ctypedef bool underlying_type_t_null_policy cdef dtype_from_column_view(column_view cv) cdef libcudf_types.data_type dtype_to_data_type(dtype) except * +cdef bool is_decimal_type_id(libcudf_types.type_id tid) except * diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index e798d78d426..1fa389f408c 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -225,10 +225,13 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: ( SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[np.dtype(dtype)])) - if tid in ( - libcudf_types.type_id.DECIMAL64, - libcudf_types.type_id.DECIMAL32 - ): + if is_decimal_type_id(tid): return libcudf_types.data_type(tid, -dtype.scale) else: return libcudf_types.data_type(tid) + +cdef bool is_decimal_type_id(libcudf_types.type_id tid) except *: 
+ return tid in ( + libcudf_types.type_id.DECIMAL64, + libcudf_types.type_id.DECIMAL32 + ) diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index 85b4bf20e5c..b6089b65aa5 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -129,13 +129,11 @@ def null_raw(self): return self._series._column.mask_array_view def make_series(self): - """Make a Series object out of this node - """ + """Make a Series object out of this node""" return self._series.copy(deep=False) def _make_dictionary_series(self): - """Make a dictionary-encoded series from this node - """ + """Make a dictionary-encoded series from this node""" assert self.is_dictionary return self._series.copy(deep=False) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index d18b536fa65..eea8e3c418f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -3,9 +3,9 @@ from __future__ import annotations, division, print_function import pickle +import warnings from typing import Any, Set -import cupy import pandas as pd import cudf @@ -261,98 +261,243 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) - def fillna(self, value, downcast=None): + @property + def has_duplicates(self): + return not self.is_unique + + def union(self, other, sort=None): """ - Fill null values with the specified value. + Form the union of two Index objects. Parameters ---------- - value : scalar - Scalar value to use to fill nulls. This value cannot be a - list-likes. + other : Index or array-like + sort : bool or None, default None + Whether to sort the resulting Index. - downcast : dict, default is None - This Parameter is currently NON-FUNCTIONAL. + * None : Sort the result, except when + + 1. `self` and `other` are equal. + 2. `self` or `other` has length 0. + + * False : do not sort the result. Returns ------- - filled : Index + union : Index Examples -------- + Union of an Index >>> import cudf - >>> index = cudf.Index([1, 2, None, 4]) - >>> index - Int64Index([1, 2, null, 4], dtype='int64') - >>> index.fillna(3) - Int64Index([1, 2, 3, 4], dtype='int64') + >>> import pandas as pd + >>> idx1 = cudf.Index([1, 2, 3, 4]) + >>> idx2 = cudf.Index([3, 4, 5, 6]) + >>> idx1.union(idx2) + Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + + MultiIndex case + + >>> idx1 = cudf.MultiIndex.from_pandas( + ... pd.MultiIndex.from_arrays( + ... [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ... ) + ... ) + >>> idx1 + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (2, 'Red'), + (2, 'Blue')], + ) + >>> idx2 = cudf.MultiIndex.from_pandas( + ... pd.MultiIndex.from_arrays( + ... [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ... ) + ... ) + >>> idx2 + MultiIndex([(3, 'Red'), + (3, 'Green'), + (2, 'Red'), + (2, 'Green')], + ) + >>> idx1.union(idx2) + MultiIndex([(1, 'Blue'), + (1, 'Red'), + (2, 'Blue'), + (2, 'Green'), + (2, 'Red'), + (3, 'Green'), + (3, 'Red')], + ) + >>> idx1.union(idx2, sort=False) + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (2, 'Red'), + (2, 'Blue'), + (3, 'Red'), + (3, 'Green'), + (2, 'Green')], + ) """ - if downcast is not None: - raise NotImplementedError( - "`downcast` parameter is not yet supported" + if not isinstance(other, BaseIndex): + other = cudf.Index(other, name=self.name) + + if sort not in {None, False}: + raise ValueError( + f"The 'sort' keyword only takes the values of " + f"None or False; {sort} was passed." 
) - return super().fillna(value=value) + if not len(other) or self.equals(other): + return self._get_reconciled_name_object(other) + elif not len(self): + return other._get_reconciled_name_object(self) - def take(self, indices): - """Gather only the specific subset of indices + result = self._union(other, sort=sort) + result.name = _get_result_name(self.name, other.name) + return result - Parameters - ---------- - indices: An array-like that maps to values contained in this Index. + def intersection(self, other, sort=False): """ - return self[indices] + Form the intersection of two Index objects. - def argsort(self, ascending=True, **kwargs): - """ - Return the integer indices that would sort the index. + This returns a new Index with elements common to the index and `other`. Parameters ---------- - ascending : bool, default True - If True, returns the indices for ascending order. - If False, returns the indices for descending order. + other : Index or array-like + sort : False or None, default False + Whether to sort the resulting index. + + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. Returns ------- - array : A cupy array containing Integer indices that - would sort the index if used as an indexer. + intersection : Index Examples -------- >>> import cudf - >>> index = cudf.Index([10, 100, 1, 1000]) - >>> index - Int64Index([10, 100, 1, 1000], dtype='int64') - >>> index.argsort() - array([2, 0, 1, 3], dtype=int32) - - The order of argsort can be reversed using - ``ascending`` parameter, by setting it to ``False``. - >>> index.argsort(ascending=False) - array([3, 1, 0, 2], dtype=int32) + >>> import pandas as pd + >>> idx1 = cudf.Index([1, 2, 3, 4]) + >>> idx2 = cudf.Index([3, 4, 5, 6]) + >>> idx1.intersection(idx2) + Int64Index([3, 4], dtype='int64') - ``argsort`` on a MultiIndex: + MultiIndex case - >>> index = cudf.MultiIndex( - ... levels=[[1, 3, 4, -10], [1, 11, 5]], - ... codes=[[0, 0, 1, 2, 3], [0, 2, 1, 1, 0]], - ... names=["x", "y"], + >>> idx1 = cudf.MultiIndex.from_pandas( + ... pd.MultiIndex.from_arrays( + ... [[1, 1, 3, 4], ["Red", "Blue", "Red", "Blue"]] + ... ) ... ) + >>> idx2 = cudf.MultiIndex.from_pandas( + ... pd.MultiIndex.from_arrays( + ... [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ... ) + ... ) + >>> idx1 + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (3, 'Red'), + (4, 'Blue')], + ) + >>> idx2 + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (2, 'Red'), + (2, 'Blue')], + ) + >>> idx1.intersection(idx2) + MultiIndex([(1, 'Red'), + (1, 'Blue')], + ) + >>> idx1.intersection(idx2, sort=False) + MultiIndex([(1, 'Red'), + (1, 'Blue')], + ) + """ + if not isinstance(other, BaseIndex): + other = cudf.Index(other, name=self.name) + + if sort not in {None, False}: + raise ValueError( + f"The 'sort' keyword only takes the values of " + f"None or False; {sort} was passed." 
+ ) + + if self.equals(other): + if self.has_duplicates: + return self.unique()._get_reconciled_name_object(other) + return self._get_reconciled_name_object(other) + + res_name = _get_result_name(self.name, other.name) + + if (self.is_boolean() and other.is_numeric()) or ( + self.is_numeric() and other.is_boolean() + ): + if isinstance(self, cudf.MultiIndex): + return self[:0].rename(res_name) + else: + return cudf.Index([], name=res_name) + + if self.has_duplicates: + lhs = self.unique() + else: + lhs = self + if other.has_duplicates: + rhs = other.unique() + else: + rhs = other + result = lhs._intersection(rhs, sort=sort) + result.name = res_name + return result + + def _get_reconciled_name_object(self, other): + """ + If the result of a set operation will be self, + return self, unless the name changes, in which + case make a shallow copy of self. + """ + name = _get_result_name(self.name, other.name) + if self.name != name: + return self.rename(name) + return self + + def fillna(self, value, downcast=None): + """ + Fill null values with the specified value. + + Parameters + ---------- + value : scalar + Scalar value to use to fill nulls. This value cannot be a + list-likes. + + downcast : dict, default is None + This Parameter is currently NON-FUNCTIONAL. + + Returns + ------- + filled : Index + + Examples + -------- + >>> import cudf + >>> index = cudf.Index([1, 2, None, 4]) >>> index - MultiIndex([( 1, 1), - ( 1, 5), - ( 3, 11), - ( 4, 11), - (-10, 1)], - names=['x', 'y']) - >>> index.argsort() - array([4, 0, 1, 2, 3], dtype=int32) - >>> index.argsort(ascending=False) - array([3, 2, 1, 0, 4], dtype=int32) + Int64Index([1, 2, null, 4], dtype='int64') + >>> index.fillna(3) + Int64Index([1, 2, 3, 4], dtype='int64') """ - indices = self._values.argsort(ascending=ascending, **kwargs) - return cupy.asarray(indices) + if downcast is not None: + raise NotImplementedError( + "`downcast` parameter is not yet supported" + ) + + return super().fillna(value=value) def to_frame(self, index=True, name=None): """Create a DataFrame with a column containing this Index @@ -416,6 +561,10 @@ def gpu_values(self): """ View the data as a numba device array object """ + warnings.warn( + "The gpu_values property is deprecated and will be removed.", + FutureWarning, + ) return self._values.data_array_view def append(self, other): @@ -544,7 +693,289 @@ def difference(self, other, sort=None): return difference - def sort_values(self, return_indexer=False, ascending=True, key=None): + def is_numeric(self): + """ + Check if the Index only consists of numeric data. + + Returns + ------- + bool + Whether or not the Index only consists of numeric data. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. 
+ + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_numeric() + True + >>> idx = cudf.Index([1, 2, 3, 4.0]) + >>> idx.is_numeric() + True + >>> idx = cudf.Index([1, 2, 3, 4]) + >>> idx.is_numeric() + True + >>> idx = cudf.Index([1, 2, 3, 4.0, np.nan]) + >>> idx.is_numeric() + True + >>> idx = cudf.Index(["Apple", "cold"]) + >>> idx.is_numeric() + False + """ + raise NotImplementedError + + def is_boolean(self): + """ + Check if the Index only consists of booleans. + + Returns + ------- + bool + Whether or not the Index only consists of booleans. + + See Also + -------- + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([True, False, True]) + >>> idx.is_boolean() + True + >>> idx = cudf.Index(["True", "False", "True"]) + >>> idx.is_boolean() + False + >>> idx = cudf.Index([1, 2, 3]) + >>> idx.is_boolean() + False + """ + raise NotImplementedError + + def is_integer(self): + """ + Check if the Index only consists of integers. + + Returns + ------- + bool + Whether or not the Index only consists of integers. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([1, 2, 3, 4]) + >>> idx.is_integer() + True + >>> idx = cudf.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_integer() + False + >>> idx = cudf.Index(["Apple", "Mango", "Watermelon"]) + >>> idx.is_integer() + False + """ + raise NotImplementedError + + def is_floating(self): + """ + Check if the Index is a floating type. + + The Index may consist of only floats, NaNs, or a mix of floats, + integers, or NaNs. + + Returns + ------- + bool + Whether or not the Index only consists of only consists + of floats, NaNs, or a mix of floats, integers, or NaNs. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_floating() + True + >>> idx = cudf.Index([1.0, 2.0, np.nan, 4.0]) + >>> idx.is_floating() + True + >>> idx = cudf.Index([1, 2, 3, 4, np.nan]) + >>> idx.is_floating() + True + >>> idx = cudf.Index([1, 2, 3, 4]) + >>> idx.is_floating() + False + """ + raise NotImplementedError + + def is_object(self): + """ + Check if the Index is of the object dtype. + + Returns + ------- + bool + Whether or not the Index is of the object dtype. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. 
+ is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(["Apple", "Mango", "Watermelon"]) + >>> idx.is_object() + True + >>> idx = cudf.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.is_object() + False + >>> idx = cudf.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_object() + False + """ + raise NotImplementedError + + def is_categorical(self): + """ + Check if the Index holds categorical data. + + Returns + ------- + bool + True if the Index is categorical. + + See Also + -------- + CategoricalIndex : Index for categorical data. + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_interval : Check if the Index holds Interval objects. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.is_categorical() + True + >>> idx = cudf.Index([1, 3, 5, 7]) + >>> idx.is_categorical() + False + >>> s = cudf.Series(["Peter", "Victor", "Elisabeth", "Mar"]) + >>> s + 0 Peter + 1 Victor + 2 Elisabeth + 3 Mar + dtype: object + >>> s.index.is_categorical() + False + """ + raise NotImplementedError + + def is_interval(self): + """ + Check if the Index holds Interval objects. + + Returns + ------- + bool + Whether or not the Index holds Interval objects. + + See Also + -------- + IntervalIndex : Index for Interval objects. + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + + Examples + -------- + >>> import cudf + >>> idx = cudf.from_pandas( + ... pd.Index([pd.Interval(left=0, right=5), + ... pd.Interval(left=5, right=10)]) + ... ) + >>> idx.is_interval() + True + >>> idx = cudf.Index([1, 3, 5, 7]) + >>> idx.is_interval() + False + """ + raise NotImplementedError + + def _union(self, other, sort=None): + # TODO: As a future optimization we should explore + # not doing `to_frame` + self_df = self.to_frame(index=False, name=0) + other_df = other.to_frame(index=False, name=0) + self_df["order"] = self_df.index + other_df["order"] = other_df.index + res = self_df.merge(other_df, on=[0], how="outer") + res = res.sort_values(by=res.columns[1:], ignore_index=True) + union_result = cudf.core.index._index_from_data({0: res._data[0]}) + + if sort is None and len(other): + return union_result.sort_values() + return union_result + + def _intersection(self, other, sort=None): + intersection_result = self.unique().join(other.unique(), how="inner") + + if sort is None and len(other): + return intersection_result.sort_values() + return intersection_result + + def sort_values( + self, + return_indexer=False, + ascending=True, + na_position="last", + key=None, + ): """ Return a sorted copy of the index, and optionally return the indices that sorted the index itself. 
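The union/intersection methods above delegate to the new _union and _intersection helpers, and sort_values gains a pandas-style na_position keyword. A short sketch of the intended behaviour, assuming null handling mirrors Series.sort_values; the commented results are illustrative, not captured output:

import cudf

idx = cudf.Index([10, None, 1, 1000])
# Nulls stay at the end by default (na_position="last")...
print(idx.sort_values())
# ...and move to the front when requested.
print(idx.sort_values(na_position="first"))
# return_indexer=True also hands back the positions used for the sort.
sorted_idx, indexer = idx.sort_values(return_indexer=True)

# The new set operations reconcile names and optionally sort the result.
a = cudf.Index([1, 2, 3, 4], name="x")
b = cudf.Index([3, 4, 5], name="x")
print(a.union(b))          # 1, 2, 3, 4, 5
print(a.intersection(b))   # 3, 4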
@@ -555,6 +986,9 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. key : None, optional This parameter is NON-FUNCTIONAL. @@ -620,12 +1054,14 @@ def sort_values(self, return_indexer=False, ascending=True, key=None): """ if key is not None: raise NotImplementedError("key parameter is not yet implemented.") + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") - indices = self._values.argsort(ascending=ascending) - index_sorted = cudf.Index(self.take(indices), name=self.name) + indices = self.argsort(ascending=ascending, na_position=na_position) + index_sorted = self.take(indices) if return_indexer: - return index_sorted, cupy.asarray(indices) + return index_sorted, indices else: return index_sorted @@ -637,7 +1073,9 @@ def unique(self): ------- Index without duplicates """ - return cudf.Index(self._values.unique(), name=self.name) + return cudf.core.index._index_from_data( + {self.name: self._values.unique()}, name=self.name + ) def join( self, other, how="left", level=None, return_indexers=False, sort=False @@ -970,3 +1408,10 @@ def from_pandas(cls, index, nan_as_null=None): @property def _constructor_expanddim(self): return cudf.MultiIndex + + +def _get_result_name(left_name, right_name): + if left_name == right_name: + return left_name + else: + return None diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index eef0f284f40..6c94a84fd37 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -10,9 +10,10 @@ from cudf._typing import ColumnLike, ScalarLike from cudf.core.column import ColumnBase from cudf.core.dataframe import DataFrame -from cudf.core.frame import Frame, SingleColumnFrame +from cudf.core.frame import Frame from cudf.core.index import Index from cudf.core.series import Series +from cudf.core.single_column_frame import SingleColumnFrame def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> ScalarLike: @@ -308,7 +309,7 @@ def where( ): result = cudf.core.column.build_categorical_column( categories=frame._data[column_name].categories, - codes=cudf.core.column.as_column( + codes=cudf.core.column.build_column( result.base_data, dtype=result.dtype ), mask=result.base_mask, @@ -367,7 +368,7 @@ def where( cudf.core.column.CategoricalColumn, frame._data[frame.name], ).categories, - codes=cudf.core.column.as_column( + codes=cudf.core.column.build_column( result.base_data, dtype=result.dtype ), mask=result.base_mask, diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py index fa6c49284f0..18c86f82f9c 100644 --- a/python/cudf/cudf/core/algorithms.py +++ b/python/cudf/cudf/core/algorithms.py @@ -59,7 +59,7 @@ def factorize(values, sort=False, na_sentinel=-1, size_hint=None): cats = values._column.dropna().unique().astype(values.dtype) name = values.name # label_encoding mutates self.name - labels = values.label_encoding(cats=cats, na_sentinel=na_sentinel).values + labels = values._label_encoding(cats=cats, na_sentinel=na_sentinel).values values.name = name return labels, cats.values if return_cupy_array else Index(cats) diff --git a/python/cudf/cudf/core/column/categorical.py 
b/python/cudf/cudf/core/column/categorical.py index d2da594fa3b..a8e868ed521 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -713,7 +713,9 @@ def deserialize(cls, header: dict, frames: list) -> CategoricalColumn: data=None, dtype=dtype, mask=mask, - children=(column.as_column(data.base_data, dtype=data.dtype),), + children=( + column.build_column(data.base_data, dtype=data.dtype), + ), ), ) @@ -859,7 +861,7 @@ def slice( codes = self.codes.slice(start, stop, stride) return cudf.core.column.build_categorical_column( categories=self.categories, - codes=cudf.core.column.as_column( + codes=cudf.core.column.build_column( codes.base_data, dtype=codes.dtype ), mask=codes.base_mask, @@ -910,7 +912,7 @@ def sort_by_values( codes, inds = self.as_numerical.sort_by_values(ascending, na_position) col = column.build_categorical_column( categories=self.dtype.categories._values, - codes=column.as_column(codes.base_data, dtype=codes.dtype), + codes=column.build_column(codes.base_data, dtype=codes.dtype), mask=codes.base_mask, size=codes.size, ordered=self.dtype.ordered, @@ -1001,7 +1003,7 @@ def unique(self) -> CategoricalColumn: codes = self.as_numerical.unique() return column.build_categorical_column( categories=self.categories, - codes=column.as_column(codes.base_data, dtype=codes.dtype), + codes=column.build_column(codes.base_data, dtype=codes.dtype), mask=codes.base_mask, offset=codes.offset, size=codes.size, @@ -1044,7 +1046,7 @@ def find_and_replace( df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col}) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: - fill_value = df._data["new"][df._data["old"].isna()][0] + fill_value = df._data["new"][df._data["old"].isnull()][0] if fill_value in self.categories: replaced = self.fillna(fill_value) else: @@ -1060,7 +1062,7 @@ def find_and_replace( else: replaced = self if df._data["new"].null_count > 0: - drop_values = df._data["old"][df._data["new"].isna()] + drop_values = df._data["old"][df._data["new"].isnull()] cur_categories = replaced.categories new_categories = cur_categories[ ~cudf.Series(cur_categories.isin(drop_values)) @@ -1096,7 +1098,7 @@ def find_and_replace( # those categories don't exist anymore # Resetting the index creates a column 'index' that associates # the original integers to the new labels - bmask = new_cats._data["cats"].notna() + bmask = new_cats._data["cats"].notnull() new_cats = cudf.DataFrame( {"cats": new_cats._data["cats"].apply_boolean_mask(bmask)} ).reset_index() @@ -1123,7 +1125,7 @@ def find_and_replace( return column.build_categorical_column( categories=new_cats["cats"], - codes=column.as_column(output.base_data, dtype=output.dtype), + codes=column.build_column(output.base_data, dtype=output.dtype), mask=output.base_mask, offset=output.offset, size=output.size, @@ -1205,7 +1207,7 @@ def fillna( result = column.build_categorical_column( categories=self.dtype.categories._values, - codes=column.as_column(result.base_data, dtype=result.dtype), + codes=column.build_column(result.base_data, dtype=result.dtype), offset=result.offset, size=result.size, mask=result.base_mask, @@ -1301,7 +1303,7 @@ def copy(self, deep: bool = True) -> CategoricalColumn: return column.build_categorical_column( categories=copied_cat, - codes=column.as_column( + codes=column.build_column( copied_col.base_data, dtype=copied_col.dtype ), offset=copied_col.offset, @@ -1312,7 +1314,7 @@ def copy(self, deep: bool = True) -> 
CategoricalColumn: else: return column.build_categorical_column( categories=self.dtype.categories._values, - codes=column.as_column( + codes=column.build_column( self.codes.base_data, dtype=self.codes.dtype ), mask=self.base_mask, @@ -1374,7 +1376,9 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: return column.build_categorical_column( categories=column.as_column(cats), - codes=column.as_column(codes_col.base_data, dtype=codes_col.dtype), + codes=column.build_column( + codes_col.base_data, dtype=codes_col.dtype + ), mask=codes_col.base_mask, size=codes_col.size, offset=codes_col.offset, @@ -1386,7 +1390,7 @@ def _with_type_metadata( if isinstance(dtype, CategoricalDtype): return column.build_categorical_column( categories=dtype.categories._values, - codes=column.as_column( + codes=column.build_column( self.codes.base_data, dtype=self.codes.dtype ), mask=self.codes.base_mask, @@ -1522,7 +1526,9 @@ def _set_categories( # codes can't have masks, so take mask out before moving in return column.build_categorical_column( categories=new_cats, - codes=column.as_column(new_codes.base_data, dtype=new_codes.dtype), + codes=column.build_column( + new_codes.base_data, dtype=new_codes.dtype + ), mask=new_codes.base_mask, size=new_codes.size, offset=new_codes.offset, @@ -1609,7 +1615,7 @@ def pandas_categorical_as_column( return column.build_categorical_column( categories=categorical.categories, - codes=column.as_column(codes.base_data, dtype=codes.dtype), + codes=column.build_column(codes.base_data, codes.dtype), size=codes.size, mask=mask, ordered=categorical.ordered, diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 18216def17e..d8c99a87f92 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -62,7 +62,7 @@ ListDtype, StructDtype, ) -from cudf.utils import ioutils, utils +from cudf.utils import utils from cudf.utils.dtypes import ( cudf_dtype_from_pa_type, get_time_unit, @@ -81,7 +81,9 @@ def as_frame(self) -> "cudf.core.frame.Frame": """ Converts a Column to Frame """ - return cudf.core.frame.SingleColumnFrame({None: self.copy(deep=False)}) + return cudf.core.single_column_frame.SingleColumnFrame( + {None: self.copy(deep=False)} + ) @property def data_array_view(self) -> "cuda.devicearray.DeviceNDArray": @@ -167,14 +169,9 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: return True if other is None or len(self) != len(other): return False - if check_dtypes: - if self.dtype != other.dtype: - return False - null_equals = self._null_equals(other) - return null_equals.all() - - def _null_equals(self, other: ColumnBase) -> ColumnBase: - return self.binary_operator("NULL_EQUALS", other) + if check_dtypes and (self.dtype != other.dtype): + return False + return self.binary_operator("NULL_EQUALS", other).all() def all(self, skipna: bool = True) -> bool: # If all entries are null the result is True, including when the column @@ -186,8 +183,8 @@ def all(self, skipna: bool = True) -> bool: if isinstance(result_col, ColumnBase): return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) - else: - return result_col + + return result_col def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. 
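The repeated as_column -> build_column swaps in the categorical code paths above avoid re-running type inference when the device buffer and dtype are already known: build_column simply wraps the existing buffer. A rough illustration of the difference, using a throwaway column whose names are illustrative only:

import cudf
from cudf.core.column import as_column, build_column

src = as_column([0, 1, 2], dtype="int8")   # inference/conversion path
# build_column reuses src's buffer directly instead of inspecting the data,
# which is what the categorical codes handling above relies on.
codes = build_column(src.base_data, dtype=src.dtype)
assert codes.dtype == src.dtype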
@@ -199,8 +196,8 @@ def any(self, skipna: bool = True) -> bool: if isinstance(result_col, ColumnBase): return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) - else: - return result_col + + return result_col def __sizeof__(self) -> int: n = 0 @@ -215,10 +212,7 @@ def dropna(self, drop_nan: bool = False) -> ColumnBase: col = self.nans_to_nulls() else: col = self - dropped_col = ( - col.as_frame()._drop_na_rows(drop_nan=drop_nan)._as_column() - ) - return dropped_col + return col.as_frame()._drop_na_rows(drop_nan=drop_nan)._as_column() def to_arrow(self) -> pa.Array: """Convert to PyArrow Array @@ -312,10 +306,7 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: result = libcudf.interop.from_arrow(data, data.column_names)[0]["None"] - result = result._with_type_metadata( - cudf_dtype_from_pa_type(array.type) - ) - return result + return result._with_type_metadata(cudf_dtype_from_pa_type(array.type)) def _get_mask_as_column(self) -> ColumnBase: return libcudf.transform.mask_to_bools( @@ -370,9 +361,6 @@ def to_array(self, fillna=None) -> np.ndarray: return self.to_gpu_array(fillna=fillna).copy_to_host() - def _reverse(self): - return libcudf.copying.reverse(self) - def _fill( self, fill_value: ScalarLike, @@ -412,12 +400,10 @@ def valid_count(self) -> int: @property def nullmask(self) -> Buffer: - """The gpu buffer for the null-mask - """ - if self.nullable: - return self.mask_array_view - else: + """The gpu buffer for the null-mask""" + if not self.nullable: raise ValueError("Column has no null mask") + return self.mask_array_view def copy(self: T, deep: bool = True) -> T: """Columns are immutable, so a deep copy produces a copy of the @@ -482,6 +468,7 @@ def view(self, dtype: Dtype) -> ColumnBase: + f" total bytes into {dtype} with size {dtype.itemsize}" ) + # This assertion prevents mypy errors below. assert self.base_data is not None new_buf_ptr = ( self.base_data.ptr + self.offset * self.dtype.itemsize @@ -642,8 +629,7 @@ def fillna( ) def isnull(self) -> ColumnBase: - """Identify missing values in a Column. - """ + """Identify missing values in a Column.""" result = libcudf.unary.is_null(self) if self.dtype.kind == "f": @@ -653,14 +639,8 @@ def isnull(self) -> ColumnBase: return result - def isna(self) -> ColumnBase: - """Identify missing values in a Column. Alias for isnull. - """ - return self.isnull() - def notnull(self) -> ColumnBase: - """Identify non-missing values in a Column. - """ + """Identify non-missing values in a Column.""" result = libcudf.unary.is_valid(self) if self.dtype.kind == "f": @@ -670,11 +650,6 @@ def notnull(self) -> ColumnBase: return result - def notna(self) -> ColumnBase: - """Identify non-missing values in a Column. Alias for notnull. - """ - return self.notnull() - def find_first_value( self, value: ScalarLike, closest: bool = False ) -> int: @@ -719,8 +694,7 @@ def take( keep_index: bool = True, nullify: bool = False, ) -> T: - """Return Column by taking values from the corresponding *indices*. 
- """ + """Return Column by taking values from the corresponding *indices*.""" # Handle zero size if indices.size == 0: return cast(T, column_empty_like(self, newsize=0)) @@ -763,9 +737,7 @@ def isin(self, values: Sequence) -> ColumnBase: # typecasting fails return full(len(self), False, dtype="bool") - res = lhs._obtain_isin_result(rhs) - - return res + return lhs._obtain_isin_result(rhs) def _process_values_for_isin( self, values: Sequence @@ -788,7 +760,7 @@ def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]: """ if self.dtype != rhs.dtype: if self.null_count and rhs.null_count: - return self.isna() + return self.isnull() else: return cudf.core.column.full(len(self), False, dtype="bool") elif self.null_count == 0 and (rhs.null_count == len(rhs)): @@ -807,8 +779,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: ) res = ldf.merge(rdf, on="x", how="left").sort_values(by="orig_order") res = res.drop_duplicates(subset="orig_order", ignore_index=True) - res = res._data["bool"].fillna(False) - return res + return res._data["bool"].fillna(False) def as_mask(self) -> Buffer: """Convert booleans to bitmask @@ -823,20 +794,10 @@ def as_mask(self) -> Buffer: return bools_to_mask(self) - @ioutils.doc_to_dlpack() - def to_dlpack(self): - """{docstring}""" - - return cudf.io.dlpack.to_dlpack(self) - @property def is_unique(self) -> bool: return self.distinct_count() == len(self) - @property - def is_monotonic(self) -> bool: - return self.is_monotonic_increasing - @property def is_monotonic_increasing(self) -> bool: return not self.has_nulls and self.as_frame()._is_sorted( @@ -900,7 +861,13 @@ def distinct_count( if method != "sort": msg = "non sort based distinct_count() not implemented yet" raise NotImplementedError(msg) - return cpp_distinct_count(self, ignore_nulls=dropna) + try: + return self._distinct_count[dropna] + except KeyError: + self._distinct_count[dropna] = cpp_distinct_count( + self, ignore_nulls=dropna + ) + return self._distinct_count[dropna] def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() @@ -957,7 +924,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: # Re-label self w.r.t. 
the provided categories if isinstance(dtype, (cudf.CategoricalDtype, pd.CategoricalDtype)): - labels = sr.label_encoding(cats=dtype.categories) + labels = sr._label_encoding(cats=dtype.categories) if "ordered" in kwargs: warnings.warn( "Ignoring the `ordered` parameter passed in `**kwargs`, " @@ -973,7 +940,9 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: cats = sr.unique().astype(sr.dtype) label_dtype = min_unsigned_type(len(cats)) - labels = sr.label_encoding(cats=cats, dtype=label_dtype, na_sentinel=1) + labels = sr._label_encoding( + cats=cats, dtype=label_dtype, na_sentinel=1 + ) # columns include null index in factorization; remove: if self.has_nulls: @@ -1032,19 +1001,17 @@ def as_decimal32_column( def apply_boolean_mask(self, mask) -> ColumnBase: mask = as_column(mask, dtype="bool") - result = ( + return ( self.as_frame()._apply_boolean_mask(boolean_mask=mask)._as_column() ) - return result def argsort( self, ascending: bool = True, na_position: builtins.str = "last" ) -> ColumnBase: - sorted_indices = self.as_frame()._get_sorted_inds( + return self.as_frame()._get_sorted_inds( ascending=ascending, na_position=na_position ) - return sorted_indices def __arrow_array__(self, type=None): raise TypeError( @@ -1128,6 +1095,12 @@ def unique(self) -> ColumnBase: """ Get unique values in the data """ + # TODO: We could avoid performing `drop_duplicates` for + # columns with values that already are unique. + # Few things to note before we can do this optimization is + # the following issue resolved: + # https://github.com/rapidsai/cudf/issues/5286 + return ( self.as_frame() .drop_duplicates(keep="first", ignore_index=True) @@ -1186,15 +1159,13 @@ def min(self, skipna: bool = None, dtype: Dtype = None): result_col = self._process_for_reduction(skipna=skipna) if isinstance(result_col, ColumnBase): return libcudf.reduce.reduce("min", result_col, dtype=dtype) - else: - return result_col + return result_col def max(self, skipna: bool = None, dtype: Dtype = None): result_col = self._process_for_reduction(skipna=skipna) if isinstance(result_col, ColumnBase): return libcudf.reduce.reduce("max", result_col, dtype=dtype) - else: - return result_col + return result_col def sum( self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0 @@ -1233,11 +1204,11 @@ def corr(self, other: ColumnBase): ) def nans_to_nulls(self: T) -> T: - if self.dtype.kind == "f": - newmask = libcudf.transform.nans_to_nulls(self) - return self.set_mask(newmask) - else: + # Only floats can contain nan. + if self.dtype.kind != "f": return self + newmask = libcudf.transform.nans_to_nulls(self) + return self.set_mask(newmask) def _process_for_reduction( self, skipna: bool = None, min_count: int = 0 @@ -1281,8 +1252,7 @@ def column_empty_like( masked: bool = False, newsize: int = None, ) -> ColumnBase: - """Allocate a new column like the given *column* - """ + """Allocate a new column like the given *column*""" if dtype is None: dtype = column.dtype row_count = len(column) if newsize is None else newsize @@ -1324,8 +1294,7 @@ def column_empty_like_same_mask( def column_empty( row_count: int, dtype: Dtype = "object", masked: bool = False ) -> ColumnBase: - """Allocate a new column like the given row_count and dtype. - """ + """Allocate a new column like the given row_count and dtype.""" dtype = cudf.dtype(dtype) children = () # type: Tuple[ColumnBase, ...] 
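The restructured nans_to_nulls above returns non-float columns untouched, since only floating-point data can hold NaN. Its user-visible effect, sketched with the public Series API (the commented counts are the expected values, not captured output):

import cudf

s = cudf.Series([1.0, float("nan"), 3.0])
# NaN is an ordinary value until the null mask is rewritten.
print(s.null_count)                    # 0
print(s.nans_to_nulls().null_count)    # 1
# Non-float columns hit the early return and come back unchanged.
print(cudf.Series([1, 2, 3]).nans_to_nulls().null_count)  # 0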
@@ -1739,12 +1708,6 @@ def as_column( if dtype is not None: data = data.astype(dtype) - elif type(arbitrary) is Buffer: - if dtype is None: - raise TypeError("dtype cannot be None if 'arbitrary' is a Buffer") - - data = build_column(arbitrary, dtype=dtype) - elif hasattr(arbitrary, "__cuda_array_interface__"): desc = arbitrary.__cuda_array_interface__ current_dtype = np.dtype(desc["typestr"]) @@ -1915,9 +1878,7 @@ def as_column( buffer = Buffer(arbitrary.view("|u1")) mask = None if nan_as_null is None or nan_as_null is True: - data = as_column( - buffer, dtype=arbitrary.dtype, nan_as_null=nan_as_null - ) + data = build_column(buffer, dtype=arbitrary.dtype) data = data._make_copy_with_na_as_null() mask = data.mask @@ -1935,9 +1896,7 @@ def as_column( buffer = Buffer(arbitrary.view("|u1")) mask = None if nan_as_null is None or nan_as_null is True: - data = as_column( - buffer, dtype=arbitrary.dtype, nan_as_null=nan_as_null - ) + data = build_column(buffer, dtype=arbitrary.dtype) data = data._make_copy_with_na_as_null() mask = data.mask @@ -2025,7 +1984,7 @@ def as_column( cudf_dtype = arbitrary._data.dtype data = Buffer(arbitrary._data.view("|u1")) - data = as_column(data, dtype=cudf_dtype) + data = build_column(data, dtype=cudf_dtype) mask = arbitrary._mask mask = bools_to_mask(as_column(mask).unary_operator("not")) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index eba6764e83d..68379002e6b 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -347,8 +347,7 @@ def as_string_column( ) def _default_na_value(self) -> DatetimeLikeScalar: - """Returns the default NA value for this column - """ + """Returns the default NA value for this column""" return np.datetime64("nat", self.time_unit) def mean(self, skipna=None, dtype=np.float64) -> ScalarLike: @@ -494,7 +493,7 @@ def _make_copy_with_na_as_null(self): na_value = np.datetime64("nat", self.time_unit) out_col = cudf._lib.replace.replace( self, - as_column( + column.build_column( Buffer(np.array([na_value], dtype=self.dtype).view("|u1")), dtype=self.dtype, ), diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index b13ad8664dc..6409a9f9196 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -25,8 +25,7 @@ class DecimalBaseColumn(NumericalBaseColumn): - """Base column for decimal64 and decimal32 columns - """ + """Base column for decimal64 and decimal32 columns""" dtype: Union[Decimal32Dtype, Decimal64Dtype] @@ -321,5 +320,5 @@ def _binop_precision(l_dtype, r_dtype, op): result = p1 + p2 + 1 else: raise NotImplementedError() - + # TODO return min(result, cudf.Decimal64Dtype.MAX_PRECISION) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index effeb957238..da51ce3becc 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,7 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
import pickle -from typing import Sequence +from typing import List, Sequence import numpy as np import pyarrow as pa @@ -17,6 +17,7 @@ extract_element, sort_lists, ) +from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._typing import BinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.api.types import _is_non_decimal_numeric_dtype, is_list_dtype from cudf.core.buffer import Buffer @@ -317,6 +318,39 @@ def from_sequences( ) return res + def as_string_column( + self, dtype: Dtype, format=None, **kwargs + ) -> "cudf.core.column.StringColumn": + """ + Create a strings column from a list column + """ + # Convert the leaf child column to strings column + cc: List[ListColumn] = [] + c: ColumnBase = self + while isinstance(c, ListColumn): + cc.insert(0, c) + c = c.children[1] + s = c.as_string_column(dtype) + + # Rebuild the list column replacing just the leaf child + lc = s + for c in cc: + o = c.children[0] + lc = cudf.core.column.ListColumn( # type: ignore + size=c.size, + dtype=cudf.ListDtype(lc.dtype), + mask=c.mask, + offset=c.offset, + null_count=c.null_count, + children=(o, lc), + ) + + # Separator strings to match the Python format + separators = as_column([", ", "[", "]"]) + + # Call libcudf to format the list column + return format_list_column(lc, separators) + class ListMethods(ColumnMethods): """ diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 27ff5da5505..becb303feeb 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,7 @@ from __future__ import annotations from types import SimpleNamespace -from typing import Any, Mapping, Sequence, Tuple, Union, cast +from typing import Any, Callable, Mapping, Sequence, Tuple, Union, cast import cupy import numpy as np @@ -21,7 +21,7 @@ column, string, ) -from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype +from cudf.core.dtypes import CategoricalDtype, Decimal32Dtype, Decimal64Dtype from cudf.utils import cudautils, utils from cudf.utils.dtypes import ( NUMERIC_TYPES, @@ -61,8 +61,7 @@ def __init__( if data.size % dtype.itemsize: raise ValueError("Buffer size must be divisible by element size") if size is None: - size = data.size // dtype.itemsize - size = size - offset + size = (data.size // dtype.itemsize) - offset super().__init__( data, @@ -118,8 +117,12 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: return output - def unary_operator(self, unaryop: str) -> ColumnBase: - return _numeric_column_unaryop(self, op=unaryop) + def unary_operator(self, unaryop: Union[str, Callable]) -> ColumnBase: + if callable(unaryop): + return libcudf.transform.transform(self, unaryop) + + unaryop = libcudf.unary.UnaryOp[unaryop.upper()] + return libcudf.unary.unary_operation(self, unaryop) def binary_operator( self, binop: str, rhs: BinaryOperand, reflect: bool = False, @@ -144,6 +147,7 @@ def binary_operator( NumericalColumn, cudf.Scalar, cudf.core.column.Decimal64Column, + cudf.core.column.Decimal32Column, ), ) or np.isscalar(rhs) @@ -155,6 +159,11 @@ def binary_operator( Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0) ) return lhs.binary_operator(binop, rhs) + elif isinstance(rhs, cudf.core.column.Decimal32Column): + lhs = self.as_decimal_column( + Decimal32Dtype(Decimal32Dtype.MAX_PRECISION, 0) + ) + return lhs.binary_operator(binop, rhs) out_dtype = np.result_type(self.dtype, rhs.dtype) if binop in ["mod", "floordiv"]: tmp = self if reflect else rhs @@ -203,8 +212,7 @@ def 
normalize_binop_value( if self.dtype.kind == "b": other_dtype = min_signed_type(other) if np.isscalar(other): - other = cudf.dtype(other_dtype).type(other) - return other + return cudf.dtype(other_dtype).type(other) else: ary = utils.scalar_broadcast_to( other, size=len(self), dtype=other_dtype @@ -289,8 +297,7 @@ def _process_values_for_isin( return lhs, rhs def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column - """ + """Returns the default NA value for this column""" dkind = self.dtype.kind if dkind == "f": return self.dtype.type(np.nan) @@ -352,7 +359,7 @@ def find_and_replace( df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: replaced = replaced.fillna( - df._data["new"][df._data["old"].isna()][0] + df._data["new"][df._data["old"].isnull()][0] ) df = df.dropna(subset=["old"]) @@ -370,10 +377,7 @@ def fillna( """ Fill null values with *fill_value* """ - if fill_nan: - col = self.nans_to_nulls() - else: - col = self + col = self.nans_to_nulls() if fill_nan else self if col.null_count == 0: return col @@ -401,73 +405,72 @@ def fillna( fill_value = cudf.Scalar(fill_value_casted) else: fill_value = column.as_column(fill_value, nan_as_null=False) - # cast safely to the same dtype as self if is_integer_dtype(col.dtype): - fill_value = _safe_cast_to_int(fill_value, col.dtype) + # cast safely to the same dtype as self + if fill_value.dtype != col.dtype: + new_fill_value = fill_value.astype(col.dtype) + if not (new_fill_value == fill_value).all(): + raise TypeError( + f"Cannot safely cast non-equivalent " + f"{col.dtype.type.__name__} to " + f"{cudf.dtype(dtype).type.__name__}" + ) + fill_value = new_fill_value else: fill_value = fill_value.astype(col.dtype) return super(NumericalColumn, col).fillna(fill_value, method) - def find_first_value( - self, value: ScalarLike, closest: bool = False + def _find_value( + self, value: ScalarLike, closest: bool, find: Callable, compare: str ) -> int: - """ - Returns offset of first value that matches. For monotonic - columns, returns the offset of the first larger value - if closest=True. - """ value = to_cudf_compatible_scalar(value) if not is_number(value): raise ValueError("Expected a numeric value") found = 0 if len(self): - found = cudautils.find_first( - self.data_array_view, value, mask=self.mask - ) - if found == -1 and self.is_monotonic and closest: - if value < self.min(): - found = 0 - elif value > self.max(): - found = len(self) - else: - found = cudautils.find_first( - self.data_array_view, value, mask=self.mask, compare="gt", + found = find(self.data_array_view, value, mask=self.mask,) + if found == -1: + if self.is_monotonic_increasing and closest: + found = find( + self.data_array_view, + value, + mask=self.mask, + compare=compare, ) if found == -1: raise ValueError("value not found") - elif found == -1: - raise ValueError("value not found") + else: + raise ValueError("value not found") return found + def find_first_value( + self, value: ScalarLike, closest: bool = False + ) -> int: + """ + Returns offset of first value that matches. For monotonic + columns, returns the offset of the first larger value + if closest=True. + """ + if self.is_monotonic_increasing and closest: + if value < self.min(): + return 0 + elif value > self.max(): + return len(self) + return self._find_value(value, closest, cudautils.find_first, "gt") + def find_last_value(self, value: ScalarLike, closest: bool = False) -> int: """ Returns offset of last value that matches. 
For monotonic columns, returns the offset of the last smaller value if closest=True. """ - value = to_cudf_compatible_scalar(value) - if not is_number(value): - raise ValueError("Expected a numeric value") - found = 0 - if len(self): - found = cudautils.find_last( - self.data_array_view, value, mask=self.mask, - ) - if found == -1 and self.is_monotonic and closest: + if self.is_monotonic_increasing and closest: if value < self.min(): - found = -1 + return -1 elif value > self.max(): - found = len(self) - 1 - else: - found = cudautils.find_last( - self.data_array_view, value, mask=self.mask, compare="lt", - ) - if found == -1: - raise ValueError("value not found") - elif found == -1: - raise ValueError("value not found") - return found + return len(self) - 1 + return self._find_value(value, closest, cudautils.find_last, "lt") def can_cast_safely(self, to_dtype: DtypeObj) -> bool: """ @@ -505,34 +508,23 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: # Column contains only infs return True - max_ = col.max() - if (min_ >= lower_) and (max_ < upper_): - return True - else: - return False + return (min_ >= lower_) and (col.max() < upper_) # want to cast int to uint elif self.dtype.kind == "i" and to_dtype.kind == "u": i_max_ = np.iinfo(self.dtype).max u_max_ = np.iinfo(to_dtype).max - if self.min() >= 0: - if i_max_ <= u_max_: - return True - if self.max() < u_max_: - return True - return False + return (self.min() >= 0) and ( + (i_max_ <= u_max_) or (self.max() < u_max_) + ) # want to cast uint to int elif self.dtype.kind == "u" and to_dtype.kind == "i": u_max_ = np.iinfo(self.dtype).max i_max_ = np.iinfo(to_dtype).max - if u_max_ <= i_max_: - return True - if self.max() < i_max_: - return True - return False + return (u_max_ <= i_max_) or (self.max() < i_max_) # want to cast int to float elif self.dtype.kind in {"i", "u"} and to_dtype.kind == "f": @@ -545,13 +537,10 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: else: filled = self.fillna(0) - if ( + return ( cudf.Series(filled).astype(to_dtype).astype(filled.dtype) == cudf.Series(filled) - ).all(): - return True - else: - return False + ).all() # want to cast float to int: elif self.dtype.kind == "f" and to_dtype.kind in {"i", "u"}: @@ -561,10 +550,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool: # best we can do is hope to catch it here and avoid compare if (self.min() >= min_) and (self.max() <= max_): filled = self.fillna(0, fill_nan=False) - if (cudf.Series(filled) % 1 == 0).all(): - return True - else: - return False + return (cudf.Series(filled) % 1 == 0).all() else: return False @@ -574,7 +560,7 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: if isinstance(dtype, CategoricalDtype): return column.build_categorical_column( categories=dtype.categories._values, - codes=as_column(self.base_data, dtype=self.dtype), + codes=build_column(self.base_data, dtype=self.dtype), mask=self.base_mask, ordered=dtype.ordered, size=self.size, @@ -602,33 +588,6 @@ def to_pandas( return pd_series -def _numeric_column_unaryop(operand: ColumnBase, op: str) -> ColumnBase: - if callable(op): - return libcudf.transform.transform(operand, op) - - op = libcudf.unary.UnaryOp[op.upper()] - return libcudf.unary.unary_operation(operand, op) - - -def _safe_cast_to_int(col: ColumnBase, dtype: DtypeObj) -> ColumnBase: - """ - Cast given NumericalColumn to given integer dtype safely. 
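`find_first_value` and `find_last_value` now delegate to the shared `_find_value`, differing only in the kernel used (`cudautils.find_first` vs. `find_last`) and the comparison applied on monotonic columns (`"gt"` vs. `"lt"`). A hedged illustration of the resulting behaviour on the internal column API (example values are hypothetical):

import cudf

col = cudf.Series([10, 20, 30])._column   # monotonically increasing
col.find_first_value(20)                  # 1: exact match
col.find_first_value(25, closest=True)    # 2: first value greater than 25
col.find_last_value(25, closest=True)     # 1: last value smaller than 25
col.find_first_value(5, closest=True)     # 0: below the minimum, clamp to the start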
- """ - assert is_integer_dtype(dtype) - - if col.dtype == dtype: - return col - - new_col = col.astype(dtype) - if (new_col == col).all(): - return new_col - else: - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{col.dtype.type.__name__} to {cudf.dtype(dtype).type.__name__}" - ) - - def _normalize_find_and_replace_input( input_column_dtype: DtypeObj, col_to_normalize: Union[ColumnBase, list] ) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index c26b8b7e09c..853fb360c50 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -194,8 +194,7 @@ def corr(self, other: ColumnBase) -> float: def round( self, decimals: int = 0, how: str = "half_even" ) -> NumericalBaseColumn: - """Round the values in the Column to the given number of decimals. - """ + """Round the values in the Column to the given number of decimals.""" return libcudf.round.round(self, decimal_places=decimals, how=how) def _apply_scan_op(self, op: str) -> ColumnBase: diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 476709a76f8..30d762ad5fc 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4,6 +4,7 @@ import builtins import pickle +import re import warnings from typing import ( TYPE_CHECKING, @@ -42,7 +43,7 @@ def str_to_boolean(column: StringColumn): - """Takes in string column and returns boolean column """ + """Takes in string column and returns boolean column""" return ( libstrings.count_characters(column) > cudf.Scalar(0, dtype="int8") ).fillna(False) @@ -96,6 +97,76 @@ def str_to_boolean(column: StringColumn): cudf.dtype("timedelta64[ns]"): str_cast.int2timedelta, } +_NAN_INF_VARIATIONS = [ + "nan", + "NAN", + "Nan", + "naN", + "nAN", + "NAn", + "nAn", + "-inf", + "-INF", + "-InF", + "-inF", + "-iNF", + "-INf", + "-iNf", + "+inf", + "+INF", + "+InF", + "+inF", + "+iNF", + "+INf", + "+Inf", + "+iNf", + "inf", + "INF", + "InF", + "inF", + "iNF", + "INf", + "iNf", +] +_LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS = [ + "NaN", + "NaN", + "NaN", + "NaN", + "NaN", + "NaN", + "NaN", + "-Inf", + "-Inf", + "-Inf", + "-Inf", + "-Inf", + "-Inf", + "-Inf", + "Inf", + "Inf", + "Inf", + "Inf", + "Inf", + "Inf", + "Inf", + "Inf", + "Inf", + "Inf", + "Inf", + "Inf", + "Inf", + "Inf", + "Inf", +] + + +def _is_supported_regex_flags(flags): + return flags == 0 or ( + (flags & (re.MULTILINE | re.DOTALL) != 0) + and (flags & ~(re.MULTILINE | re.DOTALL) == 0) + ) + class StringMethods(ColumnMethods): """ @@ -191,7 +262,9 @@ def len(self) -> SeriesOrIndex: """ Computes the length of each element in the Series/Index. - Returns : Series or Index of int + Returns + ------- + Series or Index of int A Series or Index of integer values indicating the length of each element in the Series or Index. @@ -215,7 +288,9 @@ def byte_count(self) -> SeriesOrIndex: """ Computes the number of bytes of each string in the Series/Index. - Returns : Series or Index of int + Returns + ------- + Series or Index of int A Series or Index of integer values indicating the number of bytes of each strings in the Series or Index. 
@@ -540,7 +615,7 @@ def _split_by_character(self): offset_col = self._column.children[0] - res = cudf.core.column.ListColumn( + return cudf.core.column.ListColumn( size=len(self._column), dtype=cudf.ListDtype(self._column.dtype), mask=self._column.mask, @@ -548,12 +623,11 @@ def _split_by_character(self): null_count=self._column.null_count, children=(offset_col, result_col), ) - return res def extract( self, pat: str, flags: int = 0, expand: bool = True ) -> SeriesOrIndex: - """ + r""" Extract capture groups in the regex `pat` as columns in a DataFrame. For each subject string in the Series, extract groups from the first @@ -625,7 +699,7 @@ def contains( na=np.nan, regex: bool = True, ) -> SeriesOrIndex: - """ + r""" Test if pattern or regex is contained within a string of a Series or Index. @@ -638,6 +712,8 @@ def contains( Character sequence or regular expression. If ``pat`` is list-like then regular expressions are not accepted. + flags : int, default 0 (no flags) + Flags to pass through to the regex engine (e.g. re.MULTILINE) regex : bool, default True If True, assumes the pattern is a regular expression. If False, treats the pattern as a literal string. @@ -651,9 +727,11 @@ def contains( Notes ----- - The parameters `case`, `flags`, and `na` are not yet supported and - will raise a NotImplementedError if anything other than the default + The parameters `case` and `na` are not yet supported and will + raise a NotImplementedError if anything other than the default value is set. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. Examples -------- @@ -731,18 +809,21 @@ def contains( """ # noqa W605 if case is not True: raise NotImplementedError("`case` parameter is not yet supported") - elif flags != 0: - raise NotImplementedError("`flags` parameter is not yet supported") - elif na is not np.nan: + if na is not np.nan: raise NotImplementedError("`na` parameter is not yet supported") + if regex and isinstance(pat, re.Pattern): + flags = pat.flags & ~re.U + pat = pat.pattern + if not _is_supported_regex_flags(flags): + raise ValueError("invalid `flags` parameter value") if pat is None: result_col = column.column_empty( len(self._column), dtype="bool", masked=True ) elif is_scalar(pat): - if regex is True: - result_col = libstrings.contains_re(self._column, pat) + if regex: + result_col = libstrings.contains_re(self._column, pat, flags) else: result_col = libstrings.contains( self._column, cudf.Scalar(pat, "str") @@ -904,6 +985,10 @@ def replace( if n == 0: n = -1 + # If 'pat' is re.Pattern then get the pattern string from it + if regex and isinstance(pat, re.Pattern): + pat = pat.pattern + # Pandas forces non-regex replace when pat is a single-character return self._return_or_inplace( libstrings.replace_re( @@ -925,7 +1010,7 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: Parameters ---------- - pat : str + pat : str or compiled regex Regex with groupings to identify extract sections. This should not be a compiled regex. repl : str @@ -944,6 +1029,11 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: 1 ZV576 dtype: object """ + + # If 'pat' is re.Pattern then get the pattern string from it + if isinstance(pat, re.Pattern): + pat = pat.pattern + return self._return_or_inplace( libstrings.replace_with_backrefs(self._column, pat, repl) ) @@ -1026,7 +1116,9 @@ def isinteger(self) -> SeriesOrIndex: If a string has zero characters, False is returned for that check. 
- Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -1086,7 +1178,9 @@ def ishex(self) -> SeriesOrIndex: If a string has zero characters, False is returned for that check. - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -1123,7 +1217,9 @@ def istimestamp(self, format: str) -> SeriesOrIndex: Check whether all characters in each string can be converted to a timestamp using the given format. - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -1149,7 +1245,9 @@ def isfloat(self) -> SeriesOrIndex: If a string has zero characters, False is returned for that check. - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -1216,7 +1314,9 @@ def isdecimal(self) -> SeriesOrIndex: If a string has zero characters, False is returned for that check. - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -1278,7 +1378,9 @@ def isalnum(self) -> SeriesOrIndex: Equivalent to: ``isalpha() or isdigit() or isnumeric() or isdecimal()`` - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -1345,7 +1447,9 @@ def isalpha(self) -> SeriesOrIndex: for each element of the Series/Index. If a string has zero characters, False is returned for that check. - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -1402,7 +1506,9 @@ def isdigit(self) -> SeriesOrIndex: If a string has zero characters, False is returned for that check. - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -1463,7 +1569,9 @@ def isnumeric(self) -> SeriesOrIndex: for each element of the Series/Index. If a string has zero characters, False is returned for that check. - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -1532,7 +1640,9 @@ def isupper(self) -> SeriesOrIndex: If a string has zero characters, False is returned for that check. - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -1589,7 +1699,9 @@ def islower(self) -> SeriesOrIndex: If a string has zero characters, False is returned for that check. - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -1642,7 +1754,9 @@ def isipv4(self) -> SeriesOrIndex: If a string has zero characters, False is returned for that check. - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. 
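For `str.contains`, a compiled `re.Pattern` is unpacked into its pattern string and flags (with the implicit `re.UNICODE` bit stripped) before being handed to `contains_re`, so multiline and dot-all searches now work. A hedged usage sketch:

import re
import cudf

s = cudf.Series(["line1\nline2", "single"])
s.str.contains(re.compile(r"^line2$", re.MULTILINE))  # [True, False]
s.str.contains(re.compile(r"x", re.IGNORECASE))       # raises ValueError: only MULTILINE/DOTALL are allowed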
@@ -1666,7 +1780,9 @@ def lower(self) -> SeriesOrIndex: Equivalent to `str.lower() `_. - Returns : Series or Index of object + Returns + ------- + Series or Index of object A copy of the object with all strings converted to lowercase. See also @@ -1706,7 +1822,9 @@ def upper(self) -> SeriesOrIndex: Equivalent to `str.upper() `_. - Returns : Series or Index of object + Returns + ------- + Series or Index of object See also -------- @@ -1783,7 +1901,9 @@ def swapcase(self) -> SeriesOrIndex: Equivalent to `str.swapcase() `_. - Returns : Series or Index of object + Returns + ------- + Series or Index of object See also -------- @@ -1829,7 +1949,9 @@ def title(self) -> SeriesOrIndex: Equivalent to `str.title() `_. - Returns : Series or Index of object + Returns + ------- + Series or Index of object See also -------- @@ -1865,6 +1987,32 @@ def title(self) -> SeriesOrIndex: """ return self._return_or_inplace(libstrings.title(self._column)) + def istitle(self) -> SeriesOrIndex: + """ + Check whether each string is title formatted. + The first letter of each word should be uppercase and the rest + should be lowercase. + + Equivalent to :meth:`str.istitle`. + + Returns + ------- + Series or Index of object + + Examples + -------- + >>> import cudf + >>> data = ['leopard', 'Golden Eagle', 'SNAKE', '']) + >>> s = cudf.Series(data) + >>> s.str.istitle() + 0 False + 1 True + 2 False + 3 False + dtype: bool + """ + return self._return_or_inplace(libstrings.is_title(self._column)) + def filter_alphanum( self, repl: str = None, keep: bool = True ) -> SeriesOrIndex: @@ -2391,10 +2539,10 @@ def rsplit( The handling of the n keyword depends on the number of found splits: - - If found splits > n, make first n splits only - - If found splits <= n, make all splits - - If for a certain row the number of found splits < n, - append None for padding up to n if ``expand=True``. + - If found splits > n, make first n splits only + - If found splits <= n, make all splits + - If for a certain row the number of found splits < n, + append None for padding up to n if ``expand=True``. If using ``expand=True``, Series and Index callers return DataFrame and MultiIndex objects, respectively. @@ -3247,7 +3395,7 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex: return self._return_or_inplace(libstrings.wrap(self._column, width)) def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: - """ + r""" Count occurrences of pattern in each string of the Series/Index. This function is used to count the number of times a particular @@ -3255,8 +3403,10 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: Parameters ---------- - pat : str + pat : str or compiled regex Valid regular expression. + flags : int, default 0 (no flags) + Flags to pass through to the regex engine (e.g. re.MULTILINE) Returns ------- @@ -3264,9 +3414,10 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: Notes ----- - - `flags` parameter is currently not supported. + - `flags` parameter currently only supports re.DOTALL + and re.MULTILINE. - Some characters need to be escaped when passing - in pat. eg. ``'$'`` has a special meaning in regex + in pat. e.g. ``'$'`` has a special meaning in regex and must be escaped when finding this literal character. 
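`count` and `match` get the same handling: flags taken from a compiled pattern are validated and forwarded to `count_re`/`match_re`. Hedged usage:

import re
import cudf

s = cudf.Series(["a\nb\na", "abc"])
s.str.count(re.compile(r"^a", re.MULTILINE))  # per-row counts: [2, 1]
s.str.match(re.compile(r"a.b", re.DOTALL))    # [True, False]: '.' spans the newline in row 0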
Examples @@ -3301,10 +3452,15 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: >>> index.str.count('a') Int64Index([0, 0, 2, 1], dtype='int64') """ # noqa W605 - if flags != 0: - raise NotImplementedError("`flags` parameter is not yet supported") + if isinstance(pat, re.Pattern): + flags = pat.flags & ~re.U + pat = pat.pattern + if not _is_supported_regex_flags(flags): + raise ValueError("invalid `flags` parameter value") - return self._return_or_inplace(libstrings.count_re(self._column, pat)) + return self._return_or_inplace( + libstrings.count_re(self._column, pat, flags) + ) def findall( self, pat: str, flags: int = 0, expand: bool = True @@ -3381,7 +3537,9 @@ def isempty(self) -> SeriesOrIndex: """ Check whether each string is an empty string. - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -3410,7 +3568,9 @@ def isspace(self) -> SeriesOrIndex: If a string has zero characters, False is returned for that check. - Returns : Series or Index of bool + Returns + ------- + Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. @@ -3821,8 +3981,10 @@ def match( Parameters ---------- - pat : str + pat : str or compiled regex Character sequence or regular expression. + flags : int, default 0 (no flags) + Flags to pass through to the regex engine (e.g. re.MULTILINE) Returns ------- @@ -3830,7 +3992,10 @@ def match( Notes ----- - Parameters currently not supported are: `case`, `flags` and `na`. + Parameters `case` and `na` are currently not supported. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. + Examples -------- @@ -3855,10 +4020,15 @@ def match( """ if case is not True: raise NotImplementedError("`case` parameter is not yet supported") - if flags != 0: - raise NotImplementedError("`flags` parameter is not yet supported") + if isinstance(pat, re.Pattern): + flags = pat.flags & ~re.U + pat = pat.pattern + if not _is_supported_regex_flags(flags): + raise ValueError("invalid `flags` parameter value") - return self._return_or_inplace(libstrings.match_re(self._column, pat)) + return self._return_or_inplace( + libstrings.match_re(self._column, pat, flags) + ) def url_decode(self) -> SeriesOrIndex: """ @@ -4060,7 +4230,7 @@ def filter_characters( ) def normalize_spaces(self) -> SeriesOrIndex: - """ + r""" Remove extra whitespace between tokens and trim whitespace from the beginning and the end of each string. @@ -4349,7 +4519,9 @@ def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: retain_index=False, ) - def character_ngrams(self, n: int = 2) -> SeriesOrIndex: + def character_ngrams( + self, n: int = 2, as_list: bool = False + ) -> SeriesOrIndex: """ Generate the n-grams from characters in a column of strings. @@ -4358,6 +4530,9 @@ def character_ngrams(self, n: int = 2) -> SeriesOrIndex: n : int The degree of the n-gram (number of consecutive characters). Default of 2 for bigrams. + as_list : bool + Set to True to return ngrams in a list column where each + list element is the ngrams for each string. 
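With `as_list=True`, the flat ngrams output is wrapped into a list column whose offsets child is derived from the per-string ngram counts: a string of length `L` contributes `max(L - (n - 1), 0)` ngrams, and a cumulative sum over those counts (with a leading zero) yields the offsets. Roughly, for `n=3` (values here are illustrative):

import cudf

lengths = cudf.Series(["abcd", "ef", "x"]).str.len()  # [4, 2, 1]
counts = (lengths - (3 - 1)).clip(0, None).fillna(0)  # [2, 0, 0] ngrams per string
# offsets child of the list column = [0, 2, 2, 2] after prepending 0 and scanning cumsum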
Examples -------- @@ -4380,11 +4555,32 @@ def character_ngrams(self, n: int = 2) -> SeriesOrIndex: 3 fgh 4 xyz dtype: object + >>> str_series.str.character_ngrams(3,True) + 0 [abc, bcd] + 1 [efg, fgh] + 2 [xyz] + dtype: list """ - return self._return_or_inplace( - libstrings.generate_character_ngrams(self._column, n), - retain_index=False, + ngrams = libstrings.generate_character_ngrams(self._column, n) + if as_list is False: + return self._return_or_inplace(ngrams, retain_index=False) + + # convert the output to a list by just generating the + # offsets for the output list column + sn = (self.len() - (n - 1)).clip(0, None).fillna(0) # type: ignore + sizes = libcudf.concat.concat_columns( + [column.as_column(0, dtype=np.int32, length=1), sn._column] + ) + oc = libcudf.reduce.scan("cumsum", sizes, True) + lc = cudf.core.column.ListColumn( + size=self._column.size, + dtype=cudf.ListDtype(self._column.dtype), + mask=self._column.mask, + offset=0, + null_count=self._column.null_count, + children=(oc, ngrams), ) + return self._return_or_inplace(lc, retain_index=False) def ngrams_tokenize( self, n: int = 2, delimiter: str = " ", separator: str = "_" @@ -4651,7 +4847,7 @@ def subword_tokenize( Examples -------- >>> import cudf - >>> from cudf.utils.hash_vocab_utils import hash_vocab + >>> from cudf.utils.hash_vocab_utils import hash_vocab >>> hash_vocab('bert-base-uncased-vocab.txt', 'voc_hash.txt') >>> ser = cudf.Series(['this is the', 'best book']) >>> stride, max_length = 8, 8 @@ -4744,7 +4940,7 @@ def is_consonant(self, position) -> SeriesOrIndex: 0 True 1 False dtype: bool - """ + """ ltype = libstrings.LetterType.CONSONANT if can_convert_to_column(position): @@ -5094,15 +5290,7 @@ def set_base_data(self, value): "StringColumns do not use data attribute of Column, use " "`set_base_children` instead" ) - else: - super().set_base_data(value) - - def set_base_mask(self, value: Optional[Buffer]): - super().set_base_mask(value) - - def set_base_children(self, value: Tuple["column.ColumnBase", ...]): - # TODO: Implement dtype validation of the children here somehow - super().set_base_children(value) + super().set_base_data(value) def __contains__(self, item: ScalarLike) -> bool: if is_scalar(item): @@ -5118,21 +5306,31 @@ def as_numerical_column( self, dtype: Dtype, **kwargs ) -> "cudf.core.column.NumericalColumn": out_dtype = cudf.dtype(dtype) - + string_col = self if out_dtype.kind in {"i", "u"}: - if not libstrings.is_integer(self).all(): + if not libstrings.is_integer(string_col).all(): raise ValueError( "Could not convert strings to integer " "type due to presence of non-integer values." ) elif out_dtype.kind == "f": - if not libstrings.is_float(self).all(): + # TODO: Replace this `replace` call with a + # case-insensitive method once following + # issue is fixed: https://github.com/rapidsai/cudf/issues/5217 + old_values = cudf.core.column.as_column(_NAN_INF_VARIATIONS) + new_values = cudf.core.column.as_column( + _LIBCUDF_SUPPORTED_NAN_INF_VARIATIONS + ) + string_col = libcudf.replace.replace( + string_col, old_values, new_values + ) + if not libstrings.is_float(string_col).all(): raise ValueError( "Could not convert strings to float " "type due to presence of non-floating values." 
) - result_col = _str_to_numeric_typecast_functions[out_dtype](self) + result_col = _str_to_numeric_typecast_functions[out_dtype](string_col) return result_col def _as_datetime_or_timedelta_column(self, dtype, format): @@ -5175,7 +5373,7 @@ def as_datetime_column( ) else: format = datetime.infer_format( - self.apply_boolean_mask(self.notna()).element_indexing(0) + self.apply_boolean_mask(self.notnull()).element_indexing(0) ) return self._as_datetime_or_timedelta_column(out_dtype, format) @@ -5350,7 +5548,7 @@ def find_and_replace( df = cudf.DataFrame({"old": to_replace_col, "new": replacement_col}) df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True) if df._data["old"].null_count == 1: - res = self.fillna(df._data["new"][df._data["old"].isna()][0]) + res = self.fillna(df._data["new"][df._data["old"].isnull()][0]) df = df.dropna(subset=["old"]) else: res = self diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 7167918d14d..f0d02a706e2 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,6 +1,7 @@ # Copyright (c) 2020, NVIDIA CORPORATION. from __future__ import annotations +import pandas as pd import pyarrow as pa import cudf @@ -80,6 +81,16 @@ def to_arrow(self): pa_type, len(self), buffers, children=children ) + def to_pandas(self, index: pd.Index = None, **kwargs) -> "pd.Series": + # We cannot go via Arrow's `to_pandas` because of the following issue: + # https://issues.apache.org/jira/browse/ARROW-12680 + + pd_series = pd.Series(self.to_arrow().tolist(), dtype="object") + + if index is not None: + pd_series.index = index + return pd_series + def __getitem__(self, args): result = super().__getitem__(args) if isinstance(result, dict): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index c7b13903751..4b7a3bcc197 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -305,8 +305,7 @@ def as_numerical(self) -> "cudf.core.column.NumericalColumn": ) def _default_na_value(self) -> ScalarLike: - """Returns the default NA value for this column - """ + """Returns the default NA value for this column""" return np.timedelta64("nat", self.time_unit) @property diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 56882f89af8..2411b2a9211 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -244,9 +244,7 @@ def _clear_cache(self): del self._column_length def to_pandas_index(self) -> pd.Index: - """" - Convert the keys of the ColumnAccessor to a Pandas Index object. 
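Before the `is_float` validation in `StringColumn.as_numerical_column`, the column is passed through `libcudf.replace.replace` to map the case variants listed above onto the spellings libcudf understands (`NaN`, `Inf`, `-Inf`), so mixed-case input no longer fails the cast. A hedged example:

import cudf

s = cudf.Series(["1.25", "NAN", "+InF", "-inf"])
s.astype("float64")  # now yields [1.25, nan, inf, -inf] instead of raising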
- """ + """Convert the keys of the ColumnAccessor to a Pandas Index object.""" if self.multiindex and len(self.level_names) > 0: # Using `from_frame()` instead of `from_tuples` # prevents coercion of values to a different type diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6d41d90ab47..2849536dcdb 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -45,6 +45,7 @@ from cudf.core.column import ( as_column, build_categorical_column, + build_column, column_empty, concat_columns, ) @@ -52,7 +53,12 @@ from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.groupby.groupby import DataFrameGroupBy from cudf.core.index import BaseIndex, RangeIndex, as_index -from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer +from cudf.core.indexed_frame import ( + IndexedFrame, + _FrameIndexer, + _get_label_range_or_mask, + _indices_from_labels, +) from cudf.core.series import Series from cudf.utils import applyutils, docutils, ioutils, queryutils, utils from cudf.utils.docutils import copy_docstring @@ -81,7 +87,336 @@ } -class DataFrame(Frame, Serializable, GetAttrGetItemMixin): +class _DataFrameIndexer(_FrameIndexer): + def __getitem__(self, arg): + from cudf import MultiIndex + + if isinstance(self._frame.index, MultiIndex) or isinstance( + self._frame.columns, MultiIndex + ): + # This try/except block allows the use of pandas-like + # tuple arguments into MultiIndex dataframes. + try: + return self._getitem_tuple_arg(arg) + except (TypeError, KeyError, IndexError, ValueError): + return self._getitem_tuple_arg((arg, slice(None))) + else: + if not isinstance(arg, tuple): + arg = (arg, slice(None)) + return self._getitem_tuple_arg(arg) + + def __setitem__(self, key, value): + if not isinstance(key, tuple): + key = (key, slice(None)) + return self._setitem_tuple_arg(key, value) + + def _can_downcast_to_series(self, df, arg): + """ + This method encapsulates the logic used + to determine whether or not the result of a loc/iloc + operation should be "downcasted" from a DataFrame to a + Series + """ + from cudf.core.column import as_column + + if isinstance(df, cudf.Series): + return False + nrows, ncols = df.shape + if nrows == 1: + if type(arg[0]) is slice: + if not is_scalar(arg[1]): + return False + elif (is_list_like(arg[0]) or is_column_like(arg[0])) and ( + is_list_like(arg[1]) + or is_column_like(arg[0]) + or type(arg[1]) is slice + ): + return False + else: + if is_bool_dtype(as_column(arg[0]).dtype) and not isinstance( + arg[1], slice + ): + return True + dtypes = df.dtypes.values.tolist() + all_numeric = all([is_numeric_dtype(t) for t in dtypes]) + if all_numeric: + return True + if ncols == 1: + if type(arg[1]) is slice: + return False + if isinstance(arg[1], tuple): + # Multiindex indexing with a slice + if any(isinstance(v, slice) for v in arg): + return False + if not (is_list_like(arg[1]) or is_column_like(arg[1])): + return True + return False + + def _downcast_to_series(self, df, arg): + """ + "Downcast" from a DataFrame to a Series + based on Pandas indexing rules + """ + nrows, ncols = df.shape + # determine the axis along which the Series is taken: + if nrows == 1 and ncols == 1: + if is_scalar(arg[0]) and is_scalar(arg[1]): + return df[df.columns[0]].iloc[0] + elif not is_scalar(arg[0]): + axis = 1 + else: + axis = 0 + + elif nrows == 1: + axis = 0 + elif ncols == 1: + axis = 1 + else: + raise ValueError("Cannot downcast DataFrame selection to Series") + + # take series 
along the axis: + if axis == 1: + return df[df._data.names[0]] + else: + if df._num_columns > 0: + dtypes = df.dtypes.values.tolist() + normalized_dtype = np.result_type(*dtypes) + for name, col in df._data.items(): + df[name] = col.astype(normalized_dtype) + + sr = df.T + return sr[sr._data.names[0]] + + +class _DataFrameLocIndexer(_DataFrameIndexer): + """ + For selection by label. + """ + + def _getitem_scalar(self, arg): + return self._frame[arg[1]].loc[arg[0]] + + @annotate("LOC_GETITEM", color="blue", domain="cudf_python") + def _getitem_tuple_arg(self, arg): + from uuid import uuid4 + + from cudf import MultiIndex + from cudf.core.column import column + from cudf.core.dataframe import DataFrame + from cudf.core.index import as_index + + # Step 1: Gather columns + if isinstance(arg, tuple): + columns_df = self._frame._get_columns_by_label(arg[1]) + columns_df._index = self._frame._index + else: + columns_df = self._frame + + # Step 2: Gather rows + if isinstance(columns_df.index, MultiIndex): + if isinstance(arg, (MultiIndex, pd.MultiIndex)): + if isinstance(arg, pd.MultiIndex): + arg = MultiIndex.from_pandas(arg) + + indices = _indices_from_labels(columns_df, arg) + return columns_df.take(indices) + + else: + if isinstance(arg, tuple): + return columns_df.index._get_row_major(columns_df, arg[0]) + else: + return columns_df.index._get_row_major(columns_df, arg) + else: + if isinstance(arg[0], slice): + out = _get_label_range_or_mask( + columns_df.index, arg[0].start, arg[0].stop, arg[0].step + ) + if isinstance(out, slice): + df = columns_df._slice(out) + else: + df = columns_df._apply_boolean_mask(out) + else: + tmp_arg = arg + if is_scalar(arg[0]): + # If a scalar, there is possibility of having duplicates. + # Join would get all the duplicates. So, coverting it to + # an array kind. 
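Because a scalar label is wrapped in a list before the join, every row carrying a duplicated label is returned rather than only the first match; the result collapses to a Series only when the downcast rules allow it. For example (hypothetical frame):

import cudf

df = cudf.DataFrame({"x": [10, 20, 30, 40]}, index=[1, 1, 2, 3])
df.loc[1]  # both rows labelled 1 come back as a two-row DataFrame
df.loc[2]  # a unique label on an all-numeric frame downcasts to a Series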
+ tmp_arg = ([tmp_arg[0]], tmp_arg[1]) + if len(tmp_arg[0]) == 0: + return columns_df._empty_like(keep_index=True) + tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1]) + + if is_bool_dtype(tmp_arg[0]): + df = columns_df._apply_boolean_mask(tmp_arg[0]) + else: + tmp_col_name = str(uuid4()) + other_df = DataFrame( + {tmp_col_name: column.arange(len(tmp_arg[0]))}, + index=as_index(tmp_arg[0]), + ) + df = other_df.join(columns_df, how="inner") + # as join is not assigning any names to index, + # update it over here + df.index.name = columns_df.index.name + df = df.sort_values(tmp_col_name) + df.drop(columns=[tmp_col_name], inplace=True) + # There were no indices found + if len(df) == 0: + raise KeyError(arg) + + # Step 3: Gather index + if df.shape[0] == 1: # we have a single row + if isinstance(arg[0], slice): + start = arg[0].start + if start is None: + start = self._frame.index[0] + df.index = as_index(start) + else: + row_selection = column.as_column(arg[0]) + if is_bool_dtype(row_selection.dtype): + df.index = self._frame.index.take(row_selection) + else: + df.index = as_index(row_selection) + # Step 4: Downcast + if self._can_downcast_to_series(df, arg): + return self._downcast_to_series(df, arg) + return df + + @annotate("LOC_SETITEM", color="blue", domain="cudf_python") + def _setitem_tuple_arg(self, key, value): + if isinstance(self._frame.index, cudf.MultiIndex) or isinstance( + self._frame.columns, pd.MultiIndex + ): + raise NotImplementedError( + "Setting values using df.loc[] not supported on " + "DataFrames with a MultiIndex" + ) + + try: + columns_df = self._frame._get_columns_by_label(key[1]) + except KeyError: + if not self._frame.empty and isinstance(key[0], slice): + pos_range = _get_label_range_or_mask( + self._frame.index, key[0].start, key[0].stop, key[0].step + ) + idx = self._frame.index[pos_range] + elif self._frame.empty and isinstance(key[0], slice): + idx = None + else: + idx = cudf.Index(key[0]) + if is_scalar(value): + length = len(idx) if idx is not None else 1 + value = as_column(value, length=length) + + new_col = cudf.Series(value, index=idx) + if not self._frame.empty: + new_col = new_col._align_to_index( + self._frame.index, how="right" + ) + + if self._frame.empty: + self._frame.index = ( + idx if idx is not None else cudf.RangeIndex(len(new_col)) + ) + self._frame._data.insert(key[1], new_col) + else: + if isinstance(value, (cupy.ndarray, np.ndarray)): + value_df = cudf.DataFrame(value) + if value_df.shape[1] != columns_df.shape[1]: + if value_df.shape[1] == 1: + value_cols = ( + value_df._data.columns * columns_df.shape[1] + ) + else: + raise ValueError( + f"shape mismatch: value array of shape " + f"{value_df.shape} could not be " + f"broadcast to indexing result of shape " + f"{columns_df.shape}" + ) + else: + value_cols = value_df._data.columns + for i, col in enumerate(columns_df._column_names): + self._frame[col].loc[key[0]] = value_cols[i] + else: + for col in columns_df._column_names: + self._frame[col].loc[key[0]] = value + + +class _DataFrameIlocIndexer(_DataFrameIndexer): + """ + For selection by index. 
+ """ + + @annotate("ILOC_GETITEM", color="blue", domain="cudf_python") + def _getitem_tuple_arg(self, arg): + from cudf import MultiIndex + from cudf.core.column import column + from cudf.core.index import as_index + + # Iloc Step 1: + # Gather the columns specified by the second tuple arg + columns_df = cudf.DataFrame(self._frame._get_columns_by_index(arg[1])) + + columns_df._index = self._frame._index + + # Iloc Step 2: + # Gather the rows specified by the first tuple arg + if isinstance(columns_df.index, MultiIndex): + if isinstance(arg[0], slice): + df = columns_df[arg[0]] + else: + df = columns_df.index._get_row_major(columns_df, arg[0]) + if (len(df) == 1 and len(columns_df) >= 1) and not ( + isinstance(arg[0], slice) or isinstance(arg[1], slice) + ): + # Pandas returns a numpy scalar in this case + return df.iloc[0] + if self._can_downcast_to_series(df, arg): + return self._downcast_to_series(df, arg) + return df + else: + if isinstance(arg[0], slice): + df = columns_df._slice(arg[0]) + elif is_scalar(arg[0]): + index = arg[0] + if index < 0: + index += len(columns_df) + df = columns_df._slice(slice(index, index + 1, 1)) + else: + arg = (column.as_column(arg[0]), arg[1]) + if is_bool_dtype(arg[0]): + df = columns_df._apply_boolean_mask(arg[0]) + else: + df = columns_df._gather(arg[0]) + + # Iloc Step 3: + # Reindex + if df.shape[0] == 1: # we have a single row without an index + df.index = as_index(self._frame.index[arg[0]]) + + # Iloc Step 4: + # Downcast + if self._can_downcast_to_series(df, arg): + return self._downcast_to_series(df, arg) + + if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice): + df._index = as_index(self._frame.index[arg[0]]) + return df + + @annotate("ILOC_SETITEM", color="blue", domain="cudf_python") + def _setitem_tuple_arg(self, key, value): + columns = cudf.DataFrame(self._frame._get_columns_by_index(key[1])) + + for col in columns: + self._frame[col].iloc[key[0]] = value + + def _getitem_scalar(self, arg): + col = self._frame.columns[arg[1]] + return self._frame[col].iloc[arg[0]] + + +class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): """ A GPU Dataframe object. @@ -172,8 +507,10 @@ class DataFrame(Frame, Serializable, GetAttrGetItemMixin): 3 3 0.3 """ - _PROTECTED_KEYS = frozenset(("_column_accessor", "_data", "_index")) + _PROTECTED_KEYS = frozenset(("_data", "_index")) _accessors: Set[Any] = set() + _loc_indexer_type = _DataFrameLocIndexer + _iloc_indexer_type = _DataFrameIlocIndexer @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") def __init__(self, data=None, index=None, columns=None, dtype=None): @@ -580,14 +917,17 @@ def dtypes(self): string object dtype: object """ - return cudf.utils.utils._create_pandas_series( - data=[x.dtype for x in self._data.columns], index=self._data.names, + return pd.Series(self._dtypes) + + @property + def _dtypes(self): + return dict( + zip(self._data.names, (col.dtype for col in self._data.columns)) ) @property def ndim(self): - """Dimension of the data. DataFrame ndim is always 2. - """ + """Dimension of the data. 
DataFrame ndim is always 2.""" return 2 def __dir__(self): @@ -642,7 +982,7 @@ def __getitem__(self, arg): 2 2 2 2 3 3 3 3 >>> df[-5:] # get last 5 rows of all columns - a b c + a b c 15 15 15 15 16 16 16 16 17 17 17 17 @@ -686,8 +1026,7 @@ def __getitem__(self, arg): @annotate("DATAFRAME_SETITEM", color="blue", domain="cudf_python") def __setitem__(self, arg, value): - """Add/set column by *arg or DataFrame* - """ + """Add/set column by *arg or DataFrame*""" if isinstance(arg, DataFrame): # not handling set_item where arg = df & value = df if isinstance(value, DataFrame): @@ -806,13 +1145,13 @@ def __sizeof__(self): def _slice(self: T, arg: slice) -> T: """ - _slice : slice the frame as per the arg + _slice : slice the frame as per the arg - Parameters - ---------- - arg : should always be of type slice + Parameters + ---------- + arg : should always be of type slice - """ + """ from cudf.core.index import RangeIndex num_rows = len(self) @@ -897,7 +1236,7 @@ def memory_usage(self, index=True, deep=False): ... for t in dtypes]) >>> df = cudf.DataFrame(data) >>> df.head() - int64 float64 object bool + int64 float64 object bool 0 1 1.0 1.0 True 1 1 1.0 1.0 True 2 1 1.0 1.0 True @@ -969,7 +1308,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented def _get_numeric_data(self): - """ Return a dataframe with only numeric data types """ + """Return a dataframe with only numeric data types""" columns = [ c for c, dt in self.dtypes.items() @@ -1275,7 +1614,7 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): b int64 dtype: object >>> df.astype({'a': 'float32'}) - a b + a b 0 10.0 1 1 20.0 2 2 30.0 3 @@ -1543,6 +1882,7 @@ def _binaryop( fn: str, fill_value: Any = None, reflect: bool = False, + can_reindex: bool = False, *args, **kwargs, ): @@ -1563,14 +1903,17 @@ def _binaryop( for right, (name, left) in zip(rhs, lhs._data.items()) } elif isinstance(rhs, DataFrame): - if fn in cudf.utils.utils._EQUALITY_OPS: - if not lhs.columns.equals(rhs.columns) or not lhs.index.equals( - rhs.index - ): - raise ValueError( - "Can only compare identically-labeled " - "DataFrame objects" - ) + if ( + not can_reindex + and fn in cudf.utils.utils._EQUALITY_OPS + and ( + not lhs.columns.equals(rhs.columns) + or not lhs.index.equals(rhs.index) + ) + ): + raise ValueError( + "Can only compare identically-labeled " "DataFrame objects" + ) lhs, rhs = _align_indices(lhs, rhs) @@ -1623,60 +1966,6 @@ def _binaryop( index=lhs._index, ) - def add(self, other, axis="columns", level=None, fill_value=None): - """ - Get Addition of dataframe and other, element-wise (binary - operator `add`). - - Equivalent to ``dataframe + other``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `radd`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [0, 3, 4], - ... 
'degrees': [360, 180, 360]}, - ... index=['circle', 'triangle', 'rectangle']) - >>> df + 1 - angles degrees - circle 1 361 - triangle 4 181 - rectangle 5 361 - >>> df.add(1) - angles degrees - circle 1 361 - triangle 4 181 - rectangle 5 361 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "add", fill_value) - def update( self, other, @@ -1704,978 +1993,91 @@ def update( overwrite : {True, False}, default True How to handle non-NA values for overlapping keys: - True: overwrite original DataFrame's values with values from other. - False: only update values that are NA in the original DataFrame. - - filter_func : None - filter_func is not supported yet - Return True for values that should be updated.S - - errors : {'raise', 'ignore'}, default 'ignore' - If 'raise', will raise a ValueError if the DataFrame and other - both contain non-NA data in the same place. - - - Returns - ------- - None : method directly changes calling object - - Raises - ------- - ValueError - - When ``errors`` = 'raise' and there's overlapping non-NA data. - - When ``errors`` is not either 'ignore' or 'raise' - - NotImplementedError - - If ``join`` != 'left' - """ - # TODO: Support other joins - if join != "left": - raise NotImplementedError("Only left join is supported") - if errors not in {"ignore", "raise"}: - raise ValueError( - "The parameter errors must be either 'ignore' or 'raise'" - ) - if filter_func is not None: - raise NotImplementedError("filter_func is not supported yet") - - if not isinstance(other, DataFrame): - other = DataFrame(other) - - if not self.columns.equals(other.columns): - other = other.reindex(self.columns, axis=1) - if not self.index.equals(other.index): - other = other.reindex(self.index, axis=0) - - source_df = self.copy(deep=False) - for col in source_df._column_names: - this = source_df[col] - that = other[col] - - if errors == "raise": - mask_this = that.notna() - mask_that = this.notna() - if (mask_this & mask_that).any(): - raise ValueError("Data overlaps.") - - if overwrite: - mask = that.isna() - else: - mask = this.notna() - - # don't overwrite columns unnecessarily - if mask.all(): - continue - source_df[col] = source_df[col].where(mask, that) - - self._mimic_inplace(source_df, inplace=True) - - def radd(self, other, axis=1, level=None, fill_value=None): - """ - Get Addition of dataframe and other, element-wise (binary - operator `radd`). - - Equivalent to ``other + dataframe``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `add`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [0, 3, 4], - ... 'degrees': [360, 180, 360]}, - ... 
index=['circle', 'triangle', 'rectangle']) - >>> df + 1 - angles degrees - circle 1 361 - triangle 4 181 - rectangle 5 361 - >>> df.radd(1) - angles degrees - circle 1 361 - triangle 4 181 - rectangle 5 361 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "add", fill_value, reflect=True) - - def sub(self, other, axis="columns", level=None, fill_value=None): - """ - Get Subtraction of dataframe and other, element-wise (binary - operator `sub`). - - Equivalent to ``dataframe - other``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `rsub`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [0, 3, 4], - ... 'degrees': [360, 180, 360]}, - ... index=['circle', 'triangle', 'rectangle']) - >>> df.sub(1) - angles degrees - circle -1 359 - triangle 2 179 - rectangle 3 359 - >>> df.sub([1, 2]) - angles degrees - circle -1 358 - triangle 2 178 - rectangle 3 358 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "sub", fill_value) - - def rsub(self, other, axis="columns", level=None, fill_value=None): - """ - Get Subtraction of dataframe and other, element-wise (binary - operator `rsub`). - - Equivalent to ``other - dataframe``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `sub`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [0, 3, 4], - ... 'degrees': [360, 180, 360]}, - ... 
index=['circle', 'triangle', 'rectangle']) - >>> df - angles degrees - circle 0 360 - triangle 3 180 - rectangle 4 360 - >>> df.rsub(1) - angles degrees - circle 1 -359 - triangle -2 -179 - rectangle -3 -359 - >>> df.rsub([1, 2]) - angles degrees - circle 1 -358 - triangle -2 -178 - rectangle -3 -358 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "sub", fill_value, reflect=True) - - def mul(self, other, axis="columns", level=None, fill_value=None): - """ - Get Multiplication of dataframe and other, element-wise (binary - operator `mul`). - - Equivalent to ``dataframe * other``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `rmul`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [0, 3, 4], - ... 'degrees': [360, 180, 360]}, - ... index=['circle', 'triangle', 'rectangle']) - >>> other = cudf.DataFrame({'angles': [0, 3, 4]}, - ... index=['circle', 'triangle', 'rectangle']) - >>> df * other - angles degrees - circle 0 - triangle 9 - rectangle 16 - >>> df.mul(other, fill_value=0) - angles degrees - circle 0 0 - triangle 9 0 - rectangle 16 0 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "mul", fill_value) - - def rmul(self, other, axis="columns", level=None, fill_value=None): - """ - Get Multiplication of dataframe and other, element-wise (binary - operator `rmul`). - - Equivalent to ``other * dataframe``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `mul`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [0, 3, 4], - ... 'degrees': [360, 180, 360]}, - ... index=['circle', 'triangle', 'rectangle']) - >>> other = cudf.DataFrame({'angles': [0, 3, 4]}, - ... 
index=['circle', 'triangle', 'rectangle']) - >>> other * df - angles degrees - circle 0 - triangle 9 - rectangle 16 - >>> df.rmul(other, fill_value=0) - angles degrees - circle 0 0 - triangle 9 0 - rectangle 16 0 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "mul", fill_value, reflect=True) - - def mod(self, other, axis="columns", level=None, fill_value=None): - """ - Get Modulo division of dataframe and other, element-wise (binary - operator `mod`). - - Equivalent to ``dataframe % other``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `rmod`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [0, 3, 4], - ... 'degrees': [360, 180, 360]}, - ... index=['circle', 'triangle', 'rectangle']) - >>> df % 100 - angles degrees - circle 0 60 - triangle 3 80 - rectangle 4 60 - >>> df.mod(100) - angles degrees - circle 0 60 - triangle 3 80 - rectangle 4 60 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "mod", fill_value) - - def rmod(self, other, axis="columns", level=None, fill_value=None): - """ - Get Modulo division of dataframe and other, element-wise (binary - operator `rmod`). - - Equivalent to ``other % dataframe``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `mod`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [1, 3, 4], - ... 'degrees': [360, 180, 360]}, - ... 
index=['circle', 'triangle', 'rectangle']) - >>> 100 % df - angles degrees - circle 0 100 - triangle 1 100 - rectangle 0 100 - >>> df.rmod(100) - angles degrees - circle 0 100 - triangle 1 100 - rectangle 0 100 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "mod", fill_value, reflect=True) - - def pow(self, other, axis="columns", level=None, fill_value=None): - """ - Get Exponential power of dataframe and other, element-wise (binary - operator `pow`). - - Equivalent to ``dataframe ** other``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `rpow`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [1, 3, 4], - ... 'degrees': [360, 180, 360]}, - ... index=['circle', 'triangle', 'rectangle']) - >>> df ** 2 - angles degrees - circle 0 129600 - triangle 9 32400 - rectangle 16 129600 - >>> df.pow(2) - angles degrees - circle 0 129600 - triangle 9 32400 - rectangle 16 129600 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "pow", fill_value) - - def rpow(self, other, axis="columns", level=None, fill_value=None): - """ - Get Exponential power of dataframe and other, element-wise (binary - operator `pow`). - - Equivalent to ``other ** dataframe``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `pow`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [1, 3, 4], - ... 'degrees': [360, 180, 360]}, - ... 
index=['circle', 'triangle', 'rectangle']) - >>> 1 ** df - angles degrees - circle 1 1 - triangle 1 1 - rectangle 1 1 - >>> df.rpow(1) - angles degrees - circle 1 1 - triangle 1 1 - rectangle 1 1 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "pow", fill_value, reflect=True) - - def floordiv(self, other, axis="columns", level=None, fill_value=None): - """ - Get Integer division of dataframe and other, element-wise (binary - operator `floordiv`). - - Equivalent to ``dataframe // other``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `rfloordiv`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [1, 3, 4], - ... 'degrees': [360, 180, 360]}, - ... index=['circle', 'triangle', 'rectangle']) - >>> df.floordiv(2) - angles degrees - circle 0 180 - triangle 1 90 - rectangle 2 180 - >>> df // 2 - angles degrees - circle 0 180 - triangle 1 90 - rectangle 2 180 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "floordiv", fill_value) - - def rfloordiv(self, other, axis="columns", level=None, fill_value=None): - """ - Get Integer division of dataframe and other, element-wise (binary - operator `rfloordiv`). - - Equivalent to ``other // dataframe``, but with support to substitute - a fill_value for missing data in one of the inputs. With reverse - version, `floordiv`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'col1': [10, 11, 23], - ... 
'col2': [101, 122, 321]}) - >>> df - col1 col2 - 0 10 101 - 1 11 122 - 2 23 321 - >>> df.rfloordiv(df) - col1 col2 - 0 1 1 - 1 1 1 - 2 1 1 - >>> df.rfloordiv(200) - col1 col2 - 0 20 1 - 1 18 1 - 2 8 0 - >>> df.rfloordiv(100) - col1 col2 - 0 10 0 - 1 9 0 - 2 4 0 - """ - - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "floordiv", fill_value, reflect=True) - - def truediv(self, other, axis="columns", level=None, fill_value=None): - """ - Get Floating division of dataframe and other, element-wise (binary - operator `truediv`). - - Equivalent to ``dataframe / other``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `rtruediv`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [0, 3, 4], - ... 'degrees': [360, 180, 360]}, - ... index=['circle', 'triangle', 'rectangle']) - >>> df.truediv(10) - angles degrees - circle 0.0 36.0 - triangle 0.3 18.0 - rectangle 0.4 36.0 - >>> df.div(10) - angles degrees - circle 0.0 36.0 - triangle 0.3 18.0 - rectangle 0.4 36.0 - >>> df / 10 - angles degrees - circle 0.0 36.0 - triangle 0.3 18.0 - rectangle 0.4 36.0 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "truediv", fill_value) - - # Alias for truediv - div = truediv - - def rtruediv(self, other, axis="columns", level=None, fill_value=None): - """ - Get Floating division of dataframe and other, element-wise (binary - operator `rtruediv`). - - Equivalent to ``other / dataframe``, but with support to substitute a - fill_value for missing data in one of the inputs. With reverse - version, `truediv`. - - Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`, - `dot`) to arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`, - `@`. - - Parameters - ---------- - - other : scalar, sequence, Series, or DataFrame - Any single or multiple element data structure, or list-like object. - fill_value : float or None, default None - Fill existing missing (NaN) values, and any new element needed - for successful DataFrame alignment, with this value before - computation. If data in both corresponding DataFrame locations - is missing the result will be missing. - - Returns - ------- - DataFrame - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'angles': [0, 3, 4], - ... 'degrees': [360, 180, 360]}, - ... 
index=['circle', 'triangle', 'rectangle']) - >>> df - angles degrees - circle 0 360 - triangle 3 180 - rectangle 4 360 - >>> df.rtruediv(10) - angles degrees - circle inf 0.027778 - triangle 3.333333 0.055556 - rectangle 2.500000 0.027778 - >>> df.rdiv(10) - angles degrees - circle inf 0.027778 - triangle 3.333333 0.055556 - rectangle 2.500000 0.027778 - >>> 10 / df - angles degrees - circle inf 0.027778 - triangle 3.333333 0.055556 - rectangle 2.500000 0.027778 - """ - if axis not in (1, "columns"): - raise NotImplementedError("Only axis=1 supported at this time.") - - if level is not None: - raise NotImplementedError("level parameter is not supported yet.") - - return self._binaryop(other, "truediv", fill_value, reflect=True) - - # Alias for rtruediv - rdiv = rtruediv - - def __iter__(self): - return iter(self.columns) - - def iteritems(self): - """ Iterate over column names and series pairs """ - for k in self: - yield (k, self[k]) - - @property # type: ignore - @annotate("DATAFRAME_LOC", color="blue", domain="cudf_python") - def loc(self): - """ - Selecting rows and columns by label or boolean mask. - - Examples - -------- - - DataFrame with string index. - - >>> df - a b - a 0 5 - b 1 6 - c 2 7 - d 3 8 - e 4 9 - - Select a single row by label. - - >>> df.loc['a'] - a 0 - b 5 - Name: a, dtype: int64 - - Select multiple rows and a single column. - - >>> df.loc[['a', 'c', 'e'], 'b'] - a 5 - c 7 - e 9 - Name: b, dtype: int64 - - Selection by boolean mask. - - >>> df.loc[df.a > 2] - a b - d 3 8 - e 4 9 - - Setting values using loc. - - >>> df.loc[['a', 'c', 'e'], 'a'] = 0 - >>> df - a b - a 0 5 - b 1 6 - c 0 7 - d 3 8 - e 0 9 - - See also - -------- - DataFrame.iloc - - Notes - ----- - One notable difference from Pandas is when DataFrame is of - mixed types and result is expected to be a Series in case of Pandas. - cuDF will return a DataFrame as it doesn't support mixed types - under Series yet. - - Mixed dtype single row output as a dataframe (pandas results in Series) - - >>> import cudf - >>> df = cudf.DataFrame({"a":[1, 2, 3], "b":["a", "b", "c"]}) - >>> df.loc[0] - a b - 0 1 a - """ - return _DataFrameLocIndexer(self) + True: overwrite original DataFrame's values with values from other. + False: only update values that are NA in the original DataFrame. - @property - def iloc(self): - """ - Selecting rows and column by position. + filter_func : None + filter_func is not supported yet + Return True for values that should be updated.S - Examples - -------- - >>> df = cudf.DataFrame({'a': range(20), - ... 'b': range(20), - ... 'c': range(20)}) + errors : {'raise', 'ignore'}, default 'ignore' + If 'raise', will raise a ValueError if the DataFrame and other + both contain non-NA data in the same place. - Select a single row using an integer index. - >>> df.iloc[1] - a 1 - b 1 - c 1 - Name: 1, dtype: int64 + Returns + ------- + None : method directly changes calling object - Select multiple rows using a list of integers. + Raises + ------- + ValueError + - When ``errors`` = 'raise' and there's overlapping non-NA data. 
+ - When ``errors`` is not either 'ignore' or 'raise' - >>> df.iloc[[0, 2, 9, 18]] - a b c - 0 0 0 0 - 2 2 2 2 - 9 9 9 9 - 18 18 18 18 + NotImplementedError + - If ``join`` != 'left' + """ + # TODO: Support other joins + if join != "left": + raise NotImplementedError("Only left join is supported") + if errors not in {"ignore", "raise"}: + raise ValueError( + "The parameter errors must be either 'ignore' or 'raise'" + ) + if filter_func is not None: + raise NotImplementedError("filter_func is not supported yet") - Select rows using a slice. + if not isinstance(other, DataFrame): + other = DataFrame(other) - >>> df.iloc[3:10:2] - a b c - 3 3 3 3 - 5 5 5 5 - 7 7 7 7 - 9 9 9 9 + if not self.columns.equals(other.columns): + other = other.reindex(self.columns, axis=1) + if not self.index.equals(other.index): + other = other.reindex(self.index, axis=0) - Select both rows and columns. + source_df = self.copy(deep=False) + for col in source_df._column_names: + this = source_df[col] + that = other[col] - >>> df.iloc[[1, 3, 5, 7], 2] - 1 1 - 3 3 - 5 5 - 7 7 - Name: c, dtype: int64 + if errors == "raise": + mask_this = that.notna() + mask_that = this.notna() + if (mask_this & mask_that).any(): + raise ValueError("Data overlaps.") - Setting values in a column using iloc. + if overwrite: + mask = that.isna() + else: + mask = this.notna() - >>> df.iloc[:4] = 0 - >>> df - a b c - 0 0 0 0 - 1 0 0 0 - 2 0 0 0 - 3 0 0 0 - 4 4 4 4 - 5 5 5 5 - 6 6 6 6 - 7 7 7 7 - 8 8 8 8 - 9 9 9 9 - [10 more rows] + # don't overwrite columns unnecessarily + if mask.all(): + continue + source_df[col] = source_df[col].where(mask, that) - See also - -------- - DataFrame.loc + self._mimic_inplace(source_df, inplace=True) - Notes - ----- - One notable difference from Pandas is when DataFrame is of - mixed types and result is expected to be a Series in case of Pandas. - cuDF will return a DataFrame as it doesn't support mixed types - under Series yet. + def __iter__(self): + return iter(self.columns) - Mixed dtype single row output as a dataframe (pandas results in Series) + def iteritems(self): + """Iterate over column names and series pairs""" + for k in self: + yield (k, self[k]) - >>> import cudf - >>> df = cudf.DataFrame({"a":[1, 2, 3], "b":["a", "b", "c"]}) - >>> df.iloc[0] - a b - 0 1 a - """ - return _DataFrameIlocIndexer(self) + def equals(self, other, **kwargs): + ret = super().equals(other) + # If all other checks matched, validate names. 
+ if ret: + for self_name, other_name in zip( + self._data.names, other._data.names + ): + if self_name != other_name: + ret = False + break + return ret @property def iat(self): @@ -2694,8 +2096,7 @@ def at(self): @property # type: ignore @annotate("DATAFRAME_COLUMNS_GETTER", color="yellow", domain="cudf_python") def columns(self): - """Returns a tuple of columns - """ + """Returns a tuple of columns""" return self._data.to_pandas_index() @columns.setter # type: ignore @@ -2742,40 +2143,6 @@ def _rename_columns(self, new_names): mapper = dict(zip(old_cols, new_names)) self.rename(mapper=mapper, inplace=True, axis=1) - @property - def index(self): - """Returns the index of the DataFrame - """ - return self._index - - @index.setter - def index(self, value): - old_length = ( - self._num_rows if self._index is None else len(self._index) - ) - if isinstance(value, cudf.core.multiindex.MultiIndex): - if len(self._data) > 0 and len(value) != old_length: - msg = ( - f"Length mismatch: Expected axis has {old_length} " - f"elements, new values have {len(value)} elements" - ) - raise ValueError(msg) - self._index = value - return - - new_length = len(value) - - if len(self._data) > 0 and new_length != old_length: - msg = ( - f"Length mismatch: Expected axis has {old_length} elements, " - f"new values have {new_length} elements" - ) - raise ValueError(msg) - - # try to build an index from generic _index - idx = as_index(value) - self._index = idx - def _reindex( self, columns, dtypes=None, deep=False, index=None, inplace=False ): @@ -2808,16 +2175,13 @@ def _reindex( if index is not None: index = cudf.core.index.as_index(index) - if isinstance(index, cudf.MultiIndex): - idx_dtype_match = all( - left_dtype == right_dtype - for left_dtype, right_dtype in zip( - (col.dtype for col in df.index._data.columns), - (col.dtype for col in index._data.columns), - ) + idx_dtype_match = (df.index.nlevels == index.nlevels) and all( + left_dtype == right_dtype + for left_dtype, right_dtype in zip( + (col.dtype for col in df.index._data.columns), + (col.dtype for col in index._data.columns), ) - else: - idx_dtype_match = df.index.dtype == index.dtype + ) if not idx_dtype_match: columns = ( @@ -2858,7 +2222,7 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) def reindex( - self, labels=None, axis=0, index=None, columns=None, copy=True + self, labels=None, axis=None, index=None, columns=None, copy=True ): """ Return a new DataFrame whose axes conform to a new index @@ -2907,23 +2271,34 @@ def reindex( if labels is None and index is None and columns is None: return self.copy(deep=copy) - dtypes = dict(self.dtypes) - idx = labels if index is None and axis in (0, "index") else index - cols = ( - labels if columns is None and axis in (1, "columns") else columns - ) + # pandas simply ignores the labels keyword if it is provided in + # addition to index and columns, but it prohibits the axis arg. + if (index is not None or columns is not None) and axis is not None: + raise TypeError( + "Cannot specify both 'axis' and any of 'index' or 'columns'." 
+ ) + + axis = self._get_axis_from_axis_arg(axis) + if axis == 0: + if index is None: + index = labels + else: + if columns is None: + columns = labels df = ( self - if cols is None - else self[list(set(self._column_names) & set(cols))] + if columns is None + else self[list(set(self._column_names) & set(columns))] ) - result = df._reindex( - columns=cols, dtypes=dtypes, deep=copy, index=idx, inplace=False + return df._reindex( + columns=columns, + dtypes=self._dtypes, + deep=copy, + index=index, + inplace=False, ) - return result - def set_index( self, keys, @@ -3194,47 +2569,17 @@ class max_speed if not inplace: return result - def take(self, positions, keep_index=True): - """ - Return a new DataFrame containing the rows specified by *positions* - - Parameters - ---------- - positions : array-like - Integer or boolean array-like specifying the rows of the output. - If integer, each element represents the integer index of a row. - If boolean, *positions* must be of the same length as *self*, - and represents a boolean mask. - - Returns - ------- - out : DataFrame - New DataFrame - - Examples - -------- - >>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0], - ... 'b': cudf.Series(['a', 'b', 'c'])}) - >>> a.take([0, 2, 2]) - a b - 0 1.0 a - 2 3.0 c - 2 3.0 c - >>> a.take([True, False, True]) - a b - 0 1.0 a - 2 3.0 c - """ - positions = as_column(positions) - if is_bool_dtype(positions): - return self._apply_boolean_mask(positions) - out = self._gather(positions, keep_index=keep_index) + def take(self, indices, axis=0, keep_index=None): + axis = self._get_axis_from_axis_arg(axis) + if axis != 0: + raise NotImplementedError("Only axis=0 is supported.") + out = super().take(indices, keep_index) out.columns = self.columns return out @annotate("INSERT", color="green", domain="cudf_python") def insert(self, loc, name, value): - """ Add a column to DataFrame at the index specified by loc. + """Add a column to DataFrame at the index specified by loc. Parameters ---------- @@ -3458,8 +2803,7 @@ def drop( return out def _drop_column(self, name): - """Drop a column by *name* - """ + """Drop a column by *name*""" if name not in self._data: raise KeyError(f"column '{name}' does not exist") del self._data[name] @@ -3542,8 +2886,7 @@ def drop_duplicates( return self._mimic_inplace(outdf, inplace=inplace) def pop(self, item): - """Return a column and drop it from the DataFrame. - """ + """Return a column and drop it from the DataFrame.""" popped = self[item] del self[item] return popped @@ -3697,7 +3040,7 @@ def as_gpu_matrix(self, columns=None, order="F"): warnings.warn( "The as_gpu_matrix method will be removed in a future cuDF " "release. Consider using `to_cupy` instead.", - DeprecationWarning, + FutureWarning, ) if columns is None: columns = self._data.names @@ -3745,7 +3088,7 @@ def as_matrix(self, columns=None): warnings.warn( "The as_matrix method will be removed in a future cuDF " "release. Consider using `to_numpy` instead.", - DeprecationWarning, + FutureWarning, ) return self.as_gpu_matrix(columns=columns).copy_to_host() @@ -3799,6 +3142,13 @@ def one_hot_encoding( 3 4 bird 0 1.0 0.0 0.0 4 5 fish 2 0.0 0.0 1.0 """ + + warnings.warn( + "DataFrame.one_hot_encoding is deprecated and will be removed in " + "future, use `get_dummies` instead.", + FutureWarning, + ) + if hasattr(cats, "to_arrow"): cats = cats.to_arrow().to_pylist() else: @@ -3836,7 +3186,7 @@ def label_encoding( Returns ------- - a new dataframe with a new column append for the coded values. 
+ A new DataFrame with a new column appended for the coded values. Examples -------- @@ -3854,135 +3204,27 @@ def label_encoding( 2 3 20 1 """ - newname = prefix_sep.join([prefix, "labels"]) - newcol = self[column].label_encoding( - cats=cats, dtype=dtype, na_sentinel=na_sentinel + warnings.warn( + "DataFrame.label_encoding is deprecated and will be removed in " + "the future. Consider using cuML's LabelEncoder instead.", + FutureWarning, ) - outdf = self.copy() - outdf.insert(len(outdf._data), newname, newcol) - return outdf - - @annotate("ARGSORT", color="yellow", domain="cudf_python") - def argsort(self, ascending=True, na_position="last"): - """ - Sort by the values. - - Parameters - ---------- - ascending : bool or list of bool, default True - If True, sort values in ascending order, otherwise descending. - na_position : {‘first’ or ‘last’}, default ‘last’ - Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs - at the end. - - Returns - ------- - out_column_inds : cuDF Column of indices sorted based on input - - Notes - ----- - Difference from pandas: - - - Support axis='index' only. - - Not supporting: inplace, kind - - Ascending can be a list of bools to control per column - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a':[10, 0, 2], 'b':[-10, 10, 1]}) - >>> df - a b - 0 10 -10 - 1 0 10 - 2 2 1 - >>> inds = df.argsort() - >>> inds - 0 1 - 1 2 - 2 0 - dtype: int32 - >>> df.take(inds) - a b - 1 0 10 - 2 2 1 - 0 10 -10 - """ - inds_col = self._get_sorted_inds( - ascending=ascending, na_position=na_position + return self._label_encoding( + column, prefix, cats, prefix_sep, dtype, na_sentinel ) - return cudf.Series(inds_col) - def sort_values( - self, - by, - axis=0, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - ignore_index=False, + def _label_encoding( + self, column, prefix, cats, prefix_sep="_", dtype=None, na_sentinel=-1 ): - """ - Sort by the values row-wise. - - Parameters - ---------- - by : str or list of str - Name or list of names to sort by. - ascending : bool or list of bool, default True - Sort ascending vs. descending. Specify list for multiple sort - orders. If this is a list of bools, must match the length of the - by. - na_position : {‘first’, ‘last’}, default ‘last’ - 'first' puts nulls at the beginning, 'last' puts nulls at the end - ignore_index : bool, default False - If True, index will not be sorted. - - Returns - ------- - sorted_obj : cuDF DataFrame - - Notes - ----- - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['a'] = [0, 1, 2] - >>> df['b'] = [-3, 2, 0] - >>> df.sort_values('b') - a b - 0 0 -3 - 2 2 0 - 1 1 2 - """ - if inplace: - raise NotImplementedError("`inplace` not currently implemented.") - if kind not in {"quicksort", "mergesort", "heapsort", "stable"}: - raise AttributeError( - f"{kind} is not a valid sorting algorithm for " - f"'DataFrame' object" - ) - elif kind != "quicksort": - msg = ( - f"GPU-accelerated {kind} is currently not supported, " - f"now defaulting to GPU-accelerated quicksort." 
- ) - warnings.warn(msg) - if axis != 0: - raise NotImplementedError("`axis` not currently implemented.") - - # argsort the `by` column - return self.take( - self[by].argsort(ascending=ascending, na_position=na_position), - keep_index=not ignore_index, + # Private implementation of deprecated public label_encoding method + newname = prefix_sep.join([prefix, "labels"]) + newcol = self[column]._label_encoding( + cats=cats, dtype=dtype, na_sentinel=na_sentinel ) + outdf = self.copy() + outdf.insert(len(outdf._data), newname, newcol) + return outdf def agg(self, aggs, axis=None): """ @@ -4016,7 +3258,7 @@ def agg(self, aggs, axis=None): # TODO: Remove the typecasting below once issue #6846 is fixed # link dtypes = [self[col].dtype for col in self._column_names] - common_dtype = cudf.utils.dtypes.find_common_type(dtypes) + common_dtype = find_common_type(dtypes) df_normalized = self.astype(common_dtype) if any(is_string_dtype(dt) for dt in dtypes): @@ -4176,7 +3418,7 @@ def nlargest(self, n, columns, keep="first"): Italy 59000000 1937894 IT Brunei 434000 12128 BN """ - return self._n_largest_or_smallest("nlargest", n, columns, keep) + return self._n_largest_or_smallest(True, n, columns, keep) def nsmallest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n smallest value of *columns* @@ -4244,26 +3486,7 @@ def nsmallest(self, n, columns, keep="first"): Tuvalu 11300 38 TV Nauru 337000 182 NR """ - return self._n_largest_or_smallest("nsmallest", n, columns, keep) - - def _n_largest_or_smallest(self, method, n, columns, keep): - # Get column to operate on - if not isinstance(columns, str): - [column] = columns - else: - column = columns - - col = self[column].reset_index(drop=True) - # Operate - sorted_series = getattr(col, method)(n=n, keep=keep) - df = DataFrame() - new_positions = sorted_series.index.gpu_values - for k in self._data.names: - if k == column: - df[k] = sorted_series - else: - df[k] = self[k].reset_index(drop=True).take(new_positions) - return df.set_index(self.index.take(new_positions)) + return self._n_largest_or_smallest(False, n, columns, keep) def transpose(self): """Transpose index and columns. @@ -4369,7 +3592,8 @@ def merge( If on is None and not merging on indexes then this defaults to the intersection of the columns in both DataFrames. - how : {‘left’, ‘outer’, ‘inner’}, default ‘inner’ + how : {‘left’, ‘outer’, ‘inner’, 'leftsemi', 'leftanti'}, \ + default ‘inner’ Type of merge to be performed. - left : use only keys from left frame, similar to a SQL left @@ -4377,8 +3601,14 @@ def merge( - right : not supported. - outer : use union of keys from both frames, similar to a SQL full outer join. - - inner: use intersection of keys from both frames, similar to + - inner : use intersection of keys from both frames, similar to a SQL inner join. + - leftsemi : similar to ``inner`` join, but only returns columns + from the left dataframe and ignores all columns from the + right dataframe. + - leftanti : returns only rows columns from the left dataframe + for non-matched records. This is exact opposite to ``leftsemi`` + join. left_on : label or list, or array-like Column or index level names to join on in the left DataFrame. Can also be an array or list of arrays of the length of the @@ -4706,23 +3936,21 @@ def apply( result_type: {'expand', 'reduce', 'broadcast', None}, default None Not yet supported args: tuple - Not yet supported + Positional arguments to pass to func in addition to the dataframe. 
Examples -------- Simple function of a single variable which could be NA - >>> from cudf.core.udf.pipeline import nulludf - >>> @nulludf - ... def f(x): - ... if x is cudf.NA: + >>> def f(row): + ... if row['a'] is cudf.NA: ... return 0 ... else: - ... return x + 1 + ... return row['a'] + 1 ... >>> df = cudf.DataFrame({'a': [1, cudf.NA, 3]}) - >>> df.apply(lambda row: f(row['a'])) + >>> df.apply(f, axis=1) 0 2 1 0 2 4 @@ -4731,15 +3959,14 @@ def apply( Function of multiple variables will operate in a null aware manner - >>> @nulludf - ... def f(x, y): - ... return x - y + >>> def f(row): + ... return row['a'] - row['b'] ... >>> df = cudf.DataFrame({ ... 'a': [1, cudf.NA, 3, cudf.NA], ... 'b': [5, 6, cudf.NA, cudf.NA] ... }) - >>> df.apply(lambda row: f(row['a'], row['b'])) + >>> df.apply(f) 0 -4 1 2 @@ -4748,18 +3975,17 @@ def apply( Functions may conditionally return NA as in pandas - >>> @nulludf - ... def f(x, y): - ... if x + y > 3: + >>> def f(row): + ... if row['a'] + row['b'] > 3: ... return cudf.NA ... else: - ... return x + y + ... return row['a'] + row['b'] ... >>> df = cudf.DataFrame({ ... 'a': [1, 2, 3], ... 'b': [2, 1, 1] ... }) - >>> df.apply(lambda row: f(row['a'], row['b'])) + >>> df.apply(f, axis=1) 0 3 1 3 2 @@ -4768,15 +3994,14 @@ def apply( Mixed types are allowed, but will return the common type, rather than object as in pandas - >>> @nulludf - ... def f(x, y): - ... return x + y + >>> def f(row): + ... return row['a'] + row['b'] ... >>> df = cudf.DataFrame({ ... 'a': [1, 2, 3], ... 'b': [0.5, cudf.NA, 3.14] ... }) - >>> df.apply(lambda row: f(row['a'], row['b'])) + >>> df.apply(f, axis=1) 0 1.5 1 2 6.14 @@ -4786,17 +4011,16 @@ def apply( result will be promoted to a safe type regardless of the data - >>> @nulludf - ... def f(x): - ... if x > 3: - ... return x + >>> def f(row): + ... if row['a'] > 3: + ... return row['a'] ... else: ... return 1.5 ... >>> df = cudf.DataFrame({ ... 'a': [1, 3, 5] ... }) - >>> df.apply(lambda row: f(row['a'])) + >>> df.apply(f, axis=1) 0 1.5 1 1.5 2 5.0 @@ -4804,8 +4028,10 @@ def apply( Ops against N columns are supported generally - >>> @nulludf - ... def f(v, w, x, y, z): + >>> def f(row): + ... v, w, x, y, z = ( + ... row['a'], row['b'], row['c'], row['d'], row['e'] + ... ) ... return x + (y - (z / w)) % v ... >>> df = cudf.DataFrame({ @@ -4815,36 +4041,12 @@ def apply( ... 'd': [8, 7, 8], ... 'e': [7, 1, 6] ... }) - >>> df.apply( - ... lambda row: f( - ... row['a'], - ... row['b'], - ... row['c'], - ... row['d'], - ... row['e'] - ... ) - ... 
) + >>> df.apply(f, axis=1) 0 1 4.8 2 5.0 dtype: float64 - - Notes - ----- - Available only using cuda 11.1+ due to particular required - runtime compilation features """ - - for dtype in self.dtypes: - if ( - isinstance(dtype, cudf.core.dtypes._BaseDtype) - or dtype == "object" - ): - raise TypeError( - "DataFrame.apply currently only " - "supports non decimal numeric types" - ) - if axis != 1: raise ValueError( "DataFrame.apply currently only supports row wise ops" @@ -4853,10 +4055,10 @@ def apply( raise ValueError("The `raw` kwarg is not yet supported.") if result_type is not None: raise ValueError("The `result_type` kwarg is not yet supported.") - if args or kwargs: - raise ValueError("args and kwargs are not yet supported.") + if kwargs: + raise ValueError("UDFs using **kwargs are not yet supported.") - return cudf.Series(func(self)) + return self._apply(func, *args) @applyutils.doc_apply() def apply_rows( @@ -5003,22 +4205,76 @@ def apply_chunks( tpb=tpb, ) - def hash_columns(self, columns=None): + def hash_columns(self, columns=None, method="murmur3"): """Hash the given *columns* and return a new device array + This method is deprecated. Replace ``df.hash_columns(columns, method)`` + with ``df[columns].hash_values(method)``. + Parameters ---------- columns : sequence of str; optional Sequence of column names. If columns is *None* (unspecified), all columns in the frame are used. + method : {'murmur3', 'md5'}, default 'murmur3' + Hash function to use: + * murmur3: MurmurHash3 hash function. + * md5: MD5 hash function. + + Returns + ------- + Series + Hash values for each row. """ + warnings.warn( + "The `hash_columns` method will be removed in a future cuDF " + "release. Replace `df.hash_columns(columns, method)` with " + "`df[columns].hash_values(method)`.", + FutureWarning, + ) if columns is None: - table_to_hash = self - else: - cols = [self[k]._column for k in columns] - table_to_hash = Frame(data=dict(zip(columns, cols))) + # Slice by [:] to keep all columns. + columns = slice(None, None, None) + return self[columns].hash_values(method=method) + + def hash_values(self, method="murmur3"): + """Compute the hash of values in each row. + + Parameters + ---------- + method : {'murmur3', 'md5'}, default 'murmur3' + Hash function to use: + * murmur3: MurmurHash3 hash function. + * md5: MD5 hash function. + + Returns + ------- + Series + A Series with hash values. - return Series(table_to_hash._hash()).values + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({"a": [10, 120, 30], "b": [0.0, 0.25, 0.50]}) + >>> df + a b + 0 10 0.00 + 1 120 0.25 + 2 30 0.50 + >>> df.hash_values(method="murmur3") + 0 -330519225 + 1 -397962448 + 2 -1345834934 + dtype: int32 + >>> df.hash_values(method="md5") + 0 57ce879751b5169c525907d5c563fae1 + 1 948d6221a7c4963d4be411bcead7e32b + 2 fe061786ea286a515b772d91b0dfcd70 + dtype: object + """ + return Series._from_data( + {None: self._hash(method=method)}, index=self.index + ) def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. 
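The hunk above deprecates DataFrame.hash_columns in favour of selecting the columns first and then calling the new DataFrame.hash_values. A minimal migration sketch, assuming a small frame of the caller's own (the column names below are illustrative, not taken from the patch):

import cudf

df = cudf.DataFrame({"a": [10, 120, 30], "b": [0.0, 0.25, 0.50]})

# Deprecated spelling: emits a FutureWarning and now simply delegates to
# hash_values on the selected columns.
old = df.hash_columns(["a"], method="murmur3")

# Replacement spelling suggested by the warning message.
new = df[["a"]].hash_values(method="murmur3")

# Both paths should return the same Series of per-row murmur3 hashes,
# indexed like df.
assert (old == new).all()

When called with columns=None, the deprecated method slices the whole frame, so it is equivalent to df.hash_values() over every column.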
@@ -5456,7 +4712,7 @@ def to_pandas(self, nullable=False, **kwargs): dtype: object >>> pdf = df.to_pandas(nullable=False) >>> pdf - a b + a b 0 0.0 True 1 NaN False 2 2.0 None @@ -5940,7 +5196,7 @@ def quantile( b 3.7 Name: 0.1, dtype: float64 >>> df.quantile([.1, .5]) - a b + a b 0.1 1.3 3.7 0.5 2.5 55.0 """ # noqa: E501 @@ -6167,8 +5423,7 @@ def isin(self, values): # Stats # def _prepare_for_rowwise_op(self, method, skipna): - """Prepare a DataFrame for CuPy-based row-wise operations. - """ + """Prepare a DataFrame for CuPy-based row-wise operations.""" if method not in _cupy_nan_methods_map and any( col.nullable for col in self._columns @@ -6794,20 +6049,12 @@ def cov(self, **kwargs): return df def corr(self): - """Compute the correlation matrix of a DataFrame. - """ + """Compute the correlation matrix of a DataFrame.""" corr = cupy.corrcoef(self.values, rowvar=False) df = DataFrame(cupy.asfortranarray(corr)).set_index(self.columns) df.columns = self.columns return df - def to_dict(self, orient="dict", into=dict): - raise TypeError( - "cuDF does not support conversion to host memory " - "via `to_dict()` method. Consider using " - "`.to_pandas().to_dict()` to construct a Python dictionary." - ) - def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. @@ -6904,7 +6151,7 @@ def append( See Also -------- - cudf.core.reshape.concat : General function to concatenate DataFrame or + cudf.concat : General function to concatenate DataFrame or objects. Notes @@ -7044,14 +6291,6 @@ def unstack(self, level=-1, fill_value=None): self, level=level, fill_value=fill_value ) - def equals(self, other): - if not isinstance(other, DataFrame): - return False - for self_name, other_name in zip(self._data.names, other._data.names): - if self_name != other_name: - return False - return super().equals(other) - def explode(self, column, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -7099,6 +6338,119 @@ def explode(self, column, ignore_index=False): return super()._explode(column, ignore_index) +def make_binop_func(op, postprocess=None): + # This function is used to wrap binary operations in Frame with an + # appropriate API for DataFrame as required for pandas compatibility. The + # main effect is reordering and error-checking parameters in + # DataFrame-specific ways. The postprocess argument is a callable that may + # optionally be provided to modify the result of the binop if additional + # processing is needed for pandas compatibility. The callable must have the + # signature + # def postprocess(left, right, output) + # where left and right are the inputs to the binop and output is the result + # of calling the wrapped Frame binop. + wrapped_func = getattr(Frame, op) + + @functools.wraps(wrapped_func) + def wrapper(self, other, axis="columns", level=None, fill_value=None): + if axis not in (1, "columns"): + raise NotImplementedError("Only axis=1 supported at this time.") + output = wrapped_func(self, other, axis, level, fill_value) + if postprocess is None: + return output + return postprocess(self, other, output) + + # functools.wraps copies module level attributes to `wrapper` and sets + # __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature + # string of a function by recursively delving into __wrapped__ until + # it hits the first function that has __signature__ attribute set. 
To make + # the signature stirng of `wrapper` matches with its actual parameter list, + # we directly set the __signature__ attribute of `wrapper` below. + + new_sig = inspect.signature( + lambda self, other, axis="columns", level=None, fill_value=None: None + ) + + wrapper.__signature__ = new_sig + return wrapper + + +# Wrap arithmetic Frame binop functions with the expected API for Series. +for binop in [ + "add", + "radd", + "subtract", + "sub", + "rsub", + "multiply", + "mul", + "rmul", + "mod", + "rmod", + "pow", + "rpow", + "floordiv", + "rfloordiv", + "truediv", + "div", + "divide", + "rtruediv", + "rdiv", +]: + setattr(DataFrame, binop, make_binop_func(binop)) + + +def _make_replacement_func(value): + # This function generates a postprocessing function suitable for use with + # make_binop_func that fills null columns with the desired fill value. + + def func(left, right, output): + # This function may be passed as the postprocess argument to + # make_binop_func. Columns that are only present in one of the inputs + # will be null in the output. This function postprocesses the output to + # replace those nulls with some desired output. + if isinstance(right, Series): + uncommon_columns = set(left._column_names) ^ set(right.index) + elif isinstance(right, DataFrame): + uncommon_columns = set(left._column_names) ^ set( + right._column_names + ) + elif _is_scalar_or_zero_d_array(right): + for name, col in output._data.items(): + output._data[name] = col.fillna(value) + return output + else: + return output + + for name in uncommon_columns: + output._data[name] = column.full( + size=len(output), fill_value=value, dtype="bool" + ) + return output + + return func + + +# The ne comparator needs special postprocessing because elements that missing +# in one operand should be treated as null and result in True in the output +# rather than simply propagating nulls. +DataFrame.ne = make_binop_func("ne", _make_replacement_func(True)) + + +# All other comparison operators needs return False when one of the operands is +# missing in the input. +for binop in [ + "eq", + "lt", + "le", + "gt", + "ge", +]: + setattr( + DataFrame, binop, make_binop_func(binop, _make_replacement_func(False)) + ) + + def from_pandas(obj, nan_as_null=None): """ Convert certain Pandas objects into the cudf equivalent. 
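The block comment inside make_binop_func about __wrapped__ and __signature__ describes generic CPython behaviour that can be checked outside cuDF. The self-contained sketch below (the Demo and _base_op names are illustrative stand-ins, not part of the patch) shows why the explicit __signature__ assignment is needed once functools.wraps has pointed __wrapped__ at a function with a different parameter list:

import functools
import inspect


def _base_op(self, other, op):
    # Stand-in for the underlying Frame-level binary operation.
    return (op, self, other)


def make_wrapper(op):
    @functools.wraps(_base_op)
    def wrapper(self, other, axis="columns", level=None, fill_value=None):
        if axis not in (1, "columns"):
            raise NotImplementedError("Only axis=1 supported at this time.")
        return _base_op(self, other, op)

    # functools.wraps sets wrapper.__wrapped__ = _base_op, so
    # inspect.signature would otherwise report _base_op's (self, other, op)
    # parameters. Signature resolution stops at the first callable that
    # defines __signature__, so overriding it here restores the
    # pandas-style parameter list.
    wrapper.__signature__ = inspect.signature(
        lambda self, other, axis="columns", level=None, fill_value=None: None
    )
    return wrapper


class Demo:
    add = make_wrapper("add")


print(inspect.signature(Demo.add))
# (self, other, axis='columns', level=None, fill_value=None)

Removing the __signature__ assignment makes the same print report (self, other, op), which is exactly the mismatch the comment in the patch is guarding against.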
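The comments above the DataFrame.ne assignment and the comparison-operator loop state the intended pandas-compatible semantics: an element present in only one operand compares as null, which becomes True for ne and False for every other comparator. A short illustration of that contract, again with frames of the caller's own (column names are illustrative):

import cudf

left = cudf.DataFrame({"a": [1, 2], "b": [3, 4]})
right = cudf.DataFrame({"a": [1, 0], "c": [3, 4]})

# "b" exists only in left and "c" only in right, so per the postprocessing
# above those columns should come back filled with False here ...
eq = left.eq(right)

# ... and filled with True here, matching pandas' treatment of
# non-overlapping labels in the flexible comparison methods.
ne = left.ne(right)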
@@ -7274,11 +6626,11 @@ def _setitem_with_dataframe( mask: Optional[cudf.core.column.ColumnBase] = None, ): """ - This function sets item dataframes relevant columns with replacement df - :param input_df: Dataframe to be modified inplace - :param replace_df: Replacement DataFrame to replace values with - :param input_cols: columns to replace in the input dataframe - :param mask: boolean mask in case of masked replacing + This function sets item dataframes relevant columns with replacement df + :param input_df: Dataframe to be modified inplace + :param replace_df: Replacement DataFrame to replace values with + :param input_cols: columns to replace in the input dataframe + :param mask: boolean mask in case of masked replacing """ if input_cols is None: @@ -7458,7 +6810,9 @@ def _reassign_categories(categories, cols, col_idxs): if idx in categories: cols[name] = build_categorical_column( categories=categories[idx], - codes=as_column(cols[name].base_data, dtype=cols[name].dtype), + codes=build_column( + cols[name].base_data, dtype=cols[name].dtype + ), mask=cols[name].base_mask, offset=cols[name].offset, size=cols[name].size, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 12fe7f313eb..99da216d392 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -30,11 +30,10 @@ from cudf._typing import ColumnLike, DataFrameOrSeries, Dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, - _is_scalar_or_zero_d_array, + is_bool_dtype, is_decimal_dtype, is_dict_like, is_integer_dtype, - is_list_like, is_scalar, issubdtype, ) @@ -48,7 +47,7 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import merge -from cudf.core.udf.pipeline import compile_or_get +from cudf.core.udf.pipeline import compile_or_get, supported_cols_from_frame from cudf.core.window import Rolling from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring @@ -58,8 +57,7 @@ class Frame: - """ - Frame: A collection of Column objects with an optional index. + """A collection of Column objects with an optional index. Parameters ---------- @@ -70,6 +68,8 @@ class Frame: """ _data: "ColumnAccessor" + # TODO: Once all dependence on Frame having an index is removed, this + # attribute should be moved to IndexedFrame. _index: Optional[cudf.core.index.BaseIndex] def __init__(self, data=None, index=None): @@ -385,7 +385,7 @@ def copy(self: T, deep: bool = True) -> T: b 2 dtype: int64 """ - new_frame = self.__class__.__new__(type(self)) + new_frame = self.__class__.__new__(self.__class__) new_frame._data = self._data.copy(deep=deep) if self._index is not None: @@ -500,95 +500,6 @@ def _explode(self, explode_column: Any, ignore_index: bool): res.index.names = self._index.names return res - def _sort_index( - self, - axis=0, - level=None, - ascending=True, - inplace=False, - kind=None, - na_position="last", - sort_remaining=True, - ignore_index=False, - ): - """ - Helper for .sort_index - - Parameters - ---------- - axis : {0 or ‘index’, 1 or ‘columns’}, default 0 - The axis along which to sort. The value 0 identifies the rows, - and 1 identifies the columns. - level : int or level name or list of ints or list of level names - If not None, sort on values in specified index level(s). - This is only useful in the case of MultiIndex. - ascending : bool, default True - Sort ascending vs. descending. - inplace : bool, default False - If True, perform operation in-place. - kind : sorting method such as `quick sort` and others. 
- Not yet supported. - na_position : {‘first’, ‘last’}, default ‘last’ - Puts NaNs at the beginning if first; last puts NaNs at the end. - sort_remaining : bool, default True - Not yet supported - ignore_index : bool, default False - if True, index will be replaced with RangeIndex. - - Returns - ------- - DataFrame/Series or None - """ - if kind is not None: - raise NotImplementedError("kind is not yet supported") - - if not sort_remaining: - raise NotImplementedError( - "sort_remaining == False is not yet supported" - ) - - if axis in (0, "index"): - if isinstance(self.index, cudf.MultiIndex): - if level is None: - midx_data = self.index.to_frame(index=False) - else: - # Pandas currently don't handle na_position - # in case of MultiIndex - if ascending is True: - na_position = "first" - else: - na_position = "last" - - if cudf.api.types.is_list_like(level): - labels = [ - self.index._get_level_label(lvl) for lvl in level - ] - else: - labels = [self.index._get_level_label(level)] - midx_data = cudf.DataFrame._from_data( - self.index._data.select_by_label(labels) - ) - inds = midx_data.argsort( - ascending=ascending, na_position=na_position - ) - outdf = self.take(inds) - elif (ascending and self.index.is_monotonic_increasing) or ( - not ascending and self.index.is_monotonic_decreasing - ): - outdf = self.copy() - else: - inds = self.index.argsort( - ascending=ascending, na_position=na_position - ) - outdf = self.take(inds) - else: - labels = sorted(self._data.names, reverse=not ascending) - outdf = self[labels] - - if ignore_index is True: - outdf = outdf.reset_index(drop=True) - return self._mimic_inplace(outdf, inplace=inplace) - def _get_columns_by_label(self, labels, downcast=False): """ Returns columns of the Frame specified by `labels` @@ -619,12 +530,13 @@ def _gather(self, gather_map, keep_index=True, nullify=False): ) result._copy_type_metadata(self, include_index=keep_index) + result._data.names = self._data.names if keep_index and self._index is not None: result._index.names = self._index.names return result - def _hash(self, initial_hash_values=None): - return libcudf.hash.hash(self, initial_hash_values) + def _hash(self, method, initial_hash=None): + return libcudf.hash.hash(self, method, initial_hash) def _hash_partition( self, columns_to_hash, num_partitions, keep_index=True @@ -1643,18 +1555,21 @@ def _quantiles( return result @annotate("APPLY", color="purple", domain="cudf_python") - def _apply(self, func): + def _apply(self, func, *args): """ Apply `func` across the rows of the frame. 
""" - kernel, retty = compile_or_get(self, func) + kernel, retty = compile_or_get(self, func, args) # Mask and data column preallocated ans_col = cupy.empty(len(self), dtype=retty) ans_mask = cudf.core.column.column_empty(len(self), dtype="bool") - launch_args = [(ans_col, ans_mask)] + launch_args = [(ans_col, ans_mask), len(self)] offsets = [] - for col in self._data.values(): + + # if compile_or_get succeeds, it is safe to create a kernel that only + # consumes the columns that are of supported dtype + for col in supported_cols_from_frame(self).values(): data = col.data mask = col.mask if mask is None: @@ -1663,12 +1578,12 @@ def _apply(self, func): launch_args.append((data, mask)) offsets.append(col.offset) launch_args += offsets - launch_args.append(len(self)) # size + launch_args += list(args) kernel.forall(len(self))(*launch_args) - result = cudf.Series(ans_col).set_mask( - libcudf.transform.bools_to_mask(ans_mask) - ) + col = as_column(ans_col) + col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) + result = cudf.Series._from_data({None: col}, self._index) return result @@ -1829,147 +1744,6 @@ def repeat(self, repeats, axis=None): return self._repeat(repeats) - @annotate("SORT_INDEX", color="red", domain="cudf_python") - def sort_index( - self, - axis=0, - level=None, - ascending=True, - inplace=False, - kind=None, - na_position="last", - sort_remaining=True, - ignore_index=False, - key=None, - ): - """Sort object by labels (along an axis). - - Parameters - ---------- - axis : {0 or ‘index’, 1 or ‘columns’}, default 0 - The axis along which to sort. The value 0 identifies the rows, - and 1 identifies the columns. - level : int or level name or list of ints or list of level names - If not None, sort on values in specified index level(s). - This is only useful in the case of MultiIndex. - ascending : bool, default True - Sort ascending vs. descending. - inplace : bool, default False - If True, perform operation in-place. - kind : sorting method such as `quick sort` and others. - Not yet supported. - na_position : {‘first’, ‘last’}, default ‘last’ - Puts NaNs at the beginning if first; last puts NaNs at the end. - sort_remaining : bool, default True - Not yet supported - ignore_index : bool, default False - if True, index will be replaced with RangeIndex. - key : callable, optional - If not None, apply the key function to the index values before - sorting. This is similar to the key argument in the builtin - sorted() function, with the notable difference that this key - function should be vectorized. It should expect an Index and return - an Index of the same shape. For MultiIndex inputs, the key is - applied per level. - - Returns - ------- - Frame or None - - Notes - ----- - Difference from pandas: - * Not supporting: kind, sort_remaining=False - - Examples - -------- - **Series** - >>> import cudf - >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) - >>> series - 3 a - 2 b - 1 c - 4 d - dtype: object - >>> series.sort_index() - 1 c - 2 b - 3 a - 4 d - dtype: object - - Sort Descending - - >>> series.sort_index(ascending=False) - 4 d - 3 a - 2 b - 1 c - dtype: object - - **DataFrame** - >>> df = cudf.DataFrame( - ... 
{"b":[3, 2, 1], "a":[2, 1, 3]}, index=[1, 3, 2]) - >>> df.sort_index(axis=0) - b a - 1 3 2 - 2 1 3 - 3 2 1 - >>> df.sort_index(axis=1) - a b - 1 2 3 - 3 1 2 - 2 3 1 - """ - if kind is not None: - raise NotImplementedError("kind is not yet supported") - - if not sort_remaining: - raise NotImplementedError( - "sort_remaining == False is not yet supported" - ) - - if key is not None: - raise NotImplementedError("key is not yet supported.") - - if axis in (0, "index"): - idx = self.index - if isinstance(idx, cudf.MultiIndex): - if level is None: - midx_data = idx.to_frame(index=False) - else: - # Pandas doesn't handle na_position in case of MultiIndex. - na_position = "first" if ascending is True else "last" - labels = [ - idx._get_level_label(lvl) - for lvl in (level if is_list_like(level) else (level,)) - ] - midx_data = cudf.DataFrame._from_data( - idx._data.select_by_label(labels) - ) - - inds = midx_data.argsort( - ascending=ascending, na_position=na_position - ) - out = self.take(inds) - elif (ascending and idx.is_monotonic_increasing) or ( - not ascending and idx.is_monotonic_decreasing - ): - out = self.copy() - else: - inds = idx.argsort( - ascending=ascending, na_position=na_position - ) - out = self.take(inds) - else: - labels = sorted(self._data.names, reverse=not ascending) - out = self[labels] - - if ignore_index is True: - out = out.reset_index(drop=True) - return self._mimic_inplace(out, inplace=inplace) - def _repeat(self, count): if not is_scalar(count): count = as_column(count) @@ -1981,9 +1755,6 @@ def _repeat(self, count): result._copy_type_metadata(self) return result - def _reverse(self): - return self.__class__._from_data(*libcudf.copying.reverse(self)) - def _fill(self, fill_values, begin, end, inplace): col_and_fill = zip(self._columns, fill_values) @@ -1999,8 +1770,7 @@ def _fill(self, fill_values, begin, end, inplace): return self def shift(self, periods=1, freq=None, axis=0, fill_value=None): - """Shift values by `periods` positions. - """ + """Shift values by `periods` positions.""" assert axis in (None, 0) and freq is None return self._shift(periods) @@ -2054,7 +1824,7 @@ def round(self, decimals=0, how="half_even"): ... columns=['dogs', 'cats'] ... ) >>> df - dogs cats + dogs cats 0 0.21 0.32 1 0.01 0.67 2 0.66 0.03 @@ -2064,7 +1834,7 @@ def round(self, decimals=0, how="half_even"): of decimal places >>> df.round(1) - dogs cats + dogs cats 0 0.2 0.3 1 0.0 0.7 2 0.7 0.0 @@ -2075,7 +1845,7 @@ def round(self, decimals=0, how="half_even"): places as value >>> df.round({'dogs': 1, 'cats': 0}) - dogs cats + dogs cats 0 0.2 0.0 1 0.0 1.0 2 0.7 0.0 @@ -2087,7 +1857,7 @@ def round(self, decimals=0, how="half_even"): >>> decimals = cudf.Series([0, 1], index=['cats', 'dogs']) >>> df.round(decimals) - dogs cats + dogs cats 0 0.2 0.0 1 0.0 1.0 2 0.7 0.0 @@ -2865,7 +2635,7 @@ def isnull(self): 1 6 1939-05-27 00:00:00.000000 Batman Batmobile 2 1940-04-25 00:00:00.000000 Joker >>> df.isnull() - age born name toy + age born name toy 0 False True False True 1 False False False False 2 True False False False @@ -3113,6 +2883,9 @@ def searchsorted( """ # Call libcudf++ search_sorted primitive + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + scalar_flag = None if is_scalar(values): scalar_flag = True @@ -3134,53 +2907,101 @@ def searchsorted( else: return result - def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): - """ - Sort by the values. 
+ @annotate("ARGSORT", color="yellow", domain="cudf_python") + def argsort( + self, + by=None, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + """Return the integer indices that would sort the Series values. Parameters ---------- - by: list, optional - Labels specifying columns to sort by. By default, - sort by all columns of `self` + by : str or list of str, default None + Name or list of names to sort by. If None, sort by all columns. + axis : {0 or "index"} + Has no effect but is accepted for compatibility with numpy. + kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See :func:`numpy.sort` for more + information. 'mergesort' and 'stable' are the only stable + algorithms. Only quicksort is supported in cuDF. + order : None + Has no effect but is accepted for compatibility with numpy. ascending : bool or list of bool, default True If True, sort values in ascending order, otherwise descending. na_position : {‘first’ or ‘last’}, default ‘last’ Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs at the end. + Returns ------- - out_column_inds : cuDF Column of indices sorted based on input - - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - * Ascending can be a list of bools to control per column - """ - - # This needs to be updated to handle list of bools for ascending - if ascending is True: - if na_position == "last": - na_position = 0 - elif na_position == "first": - na_position = 1 - elif ascending is False: - if na_position == "last": - na_position = 1 - elif na_position == "first": - na_position = 0 - else: + cupy.ndarray: The indices sorted based on input. + + Examples + -------- + **Series** + + >>> import cudf + >>> s = cudf.Series([3, 1, 2]) + >>> s + 0 3 + 1 1 + 2 2 + dtype: int64 + >>> s.argsort() + 0 1 + 1 2 + 2 0 + dtype: int32 + >>> s[s.argsort()] + 1 1 + 2 2 + 0 3 + dtype: int64 + + **DataFrame** + >>> import cudf + >>> df = cudf.DataFrame({'foo': [3, 1, 2]}) + >>> df.argsort() + array([1, 2, 0], dtype=int32) + + **Index** + >>> import cudf + >>> idx = cudf.Index([3, 1, 2]) + >>> idx.argsort() + array([1, 2, 0], dtype=int32) + """ # noqa: E501 + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + if kind != "quicksort": + if kind not in {"mergesort", "heapsort", "stable"}: + raise AttributeError( + f"{kind} is not a valid sorting algorithm for " + f"'DataFrame' object" + ) warnings.warn( - "When using a sequence of booleans for `ascending`, " - "`na_position` flag is not yet supported and defaults to " - "treating nulls as greater than all numbers" + f"GPU-accelerated {kind} is currently not supported, " + "defaulting to quicksort." ) - na_position = 0 + + if isinstance(by, str): + by = [by] + return self._get_sorted_inds( + by=by, ascending=ascending, na_position=na_position + ).values + + def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): + # Get an int64 column consisting of the indices required to sort self + # according to the columns specified in by. 
to_sort = ( self if by is None - else self._get_columns_by_label(by, downcast=False) + else self._get_columns_by_label(list(by), downcast=False) ) # If given a scalar need to construct a sequence of length # of columns @@ -3189,6 +3010,74 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) + def take(self, indices, keep_index=None): + """Return a new object containing the rows specified by *positions* + + Parameters + ---------- + indices : array-like + Array of ints indicating which positions to take. + keep_index : bool, default True + Whether to retain the index in result or not. + + Returns + ------- + out : Series or DataFrame or Index + New object with desired subset of rows. + + Examples + -------- + **Series** + >>> s = cudf.Series(['a', 'b', 'c', 'd', 'e']) + >>> s.take([2, 0, 4, 3]) + 2 c + 0 a + 4 e + 3 d + dtype: object + + **DataFrame** + + >>> a = cudf.DataFrame({'a': [1.0, 2.0, 3.0], + ... 'b': cudf.Series(['a', 'b', 'c'])}) + >>> a.take([0, 2, 2]) + a b + 0 1.0 a + 2 3.0 c + 2 3.0 c + >>> a.take([True, False, True]) + a b + 0 1.0 a + 2 3.0 c + + **Index** + + >>> idx = cudf.Index(['a', 'b', 'c', 'd', 'e']) + >>> idx.take([2, 0, 4, 3]) + StringIndex(['c' 'a' 'e' 'd'], dtype='object') + """ + # TODO: When we remove keep_index we should introduce the axis + # parameter. We could also introduce is_copy, but that's already + # deprecated in pandas so it's probably unnecessary. We also need to + # introduce Index.take's allow_fill and fill_value parameters. + if keep_index is not None: + warnings.warn( + "keep_index is deprecated and will be removed in the future.", + FutureWarning, + ) + else: + keep_index = True + + indices = as_column(indices) + if is_bool_dtype(indices): + warnings.warn( + "Calling take with a boolean array is deprecated and will be " + "removed in the future.", + FutureWarning, + ) + return self._apply_boolean_mask(indices) + return self._gather(indices, keep_index=keep_index) + def sin(self): """ Get Trigonometric sine, element-wise. @@ -3718,6 +3607,127 @@ def sqrt(self): """ return self._unaryop("sqrt") + def abs(self): + """ + Return a Series/DataFrame with absolute numeric value of each element. + + This function only applies to elements that are all numeric. + + Returns + ------- + DataFrame/Series + Absolute value of each element. + + Examples + -------- + Absolute numeric values in a Series + + >>> s = cudf.Series([-1.10, 2, -3.33, 4]) + >>> s.abs() + 0 1.10 + 1 2.00 + 2 3.33 + 3 4.00 + dtype: float64 + """ + return self._unaryop("abs") + + # Rounding + def ceil(self): + """ + Rounds each value upward to the smallest integral value not less + than the original. + + Returns + ------- + DataFrame or Series + Ceiling value of each element. + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([1.1, 2.8, 3.5, 4.5]) + >>> series + 0 1.1 + 1 2.8 + 2 3.5 + 3 4.5 + dtype: float64 + >>> series.ceil() + 0 2.0 + 1 3.0 + 2 4.0 + 3 5.0 + dtype: float64 + """ + return self._unaryop("ceil") + + def floor(self): + """Rounds each value downward to the largest integral value not greater + than the original. + + Returns + ------- + DataFrame or Series + Flooring value of each element. 
+ + Examples + -------- + >>> import cudf + >>> series = cudf.Series([-1.9, 2, 0.2, 1.5, 0.0, 3.0]) + >>> series + 0 -1.9 + 1 2.0 + 2 0.2 + 3 1.5 + 4 0.0 + 5 3.0 + dtype: float64 + >>> series.floor() + 0 -2.0 + 1 2.0 + 2 0.0 + 3 1.0 + 4 0.0 + 5 3.0 + dtype: float64 + """ + return self._unaryop("floor") + + def scale(self): + """ + Scale values to [0, 1] in float64 + + Returns + ------- + DataFrame or Series + Values scaled to [0, 1]. + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 11, 12, 0.5, 1]) + >>> series + 0 10.0 + 1 11.0 + 2 12.0 + 3 0.5 + 4 1.0 + dtype: float64 + >>> series.scale() + 0 0.826087 + 1 0.913043 + 2 1.000000 + 3 0.000000 + 4 0.043478 + dtype: float64 + """ + vmin = self.min() + vmax = self.max() + scaled = (self - vmin) / (vmax - vmin) + scaled._index = self._index.copy(deep=False) + return scaled + def _merge( self, right, @@ -5209,12 +5219,12 @@ def nans_to_nulls(self): >>> df['a'] = cudf.Series([1, None, np.nan], nan_as_null=False) >>> df['b'] = cudf.Series([None, 3.14, np.nan], nan_as_null=False) >>> df - a b + a b 0 1.0 1 3.14 2 NaN NaN >>> df.nans_to_nulls() - a b + a b 0 1.0 1 3.14 2 @@ -5237,328 +5247,1602 @@ def __invert__(self): self._index, ) + def add(self, other, axis, level=None, fill_value=None): + """ + Get Addition of dataframe or series and other, element-wise (binary + operator `add`). -class SingleColumnFrame(Frame): - """A one-dimensional frame. + Equivalent to ``frame + other``, but with support to substitute a + ``fill_value`` for missing data in one of the inputs. - Frames with only a single column share certain logic that is encoded in - this class. - """ + Parameters + ---------- - _SUPPORT_AXIS_LOOKUP = { - 0: 0, - None: 0, - "index": 0, - } + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. - def _reduce( - self, op, axis=None, level=None, numeric_only=None, **kwargs, - ): - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") + Returns + ------- + DataFrame or Series + Result of the arithmetic operation. - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") + Examples + -------- - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - return getattr(self._column, op)(**kwargs) + **DataFrame** - def _scan(self, op, axis=None, *args, **kwargs): - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") + >>> df = cudf.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... 
index=['circle', 'triangle', 'rectangle']) + >>> df + 1 + angles degrees + circle 1 361 + triangle 4 181 + rectangle 5 361 + >>> df.add(1) + angles degrees + circle 1 361 + triangle 4 181 + rectangle 5 361 - return super()._scan(op, axis=axis, *args, **kwargs) + **Series** - @classmethod - def _from_data( - cls, - data: MutableMapping, - index: Optional[cudf.core.index.BaseIndex] = None, - name: Any = None, - ): + >>> a = cudf.Series([1, 1, 1, None], index=['a', 'b', 'c', 'd']) + >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e']) + >>> a.add(b) + a 2 + b + c + d + e + dtype: int64 + >>> a.add(b, fill_value=0) + a 2 + b 1 + c 1 + d 1 + e + dtype: int64 + """ + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") - out = super()._from_data(data, index) - if name is not None: - out.name = name - return out + return self._binaryop(other, "add", fill_value) - @property - def name(self): - """The name of this object.""" - return next(iter(self._data.names)) + def radd(self, other, axis, level=None, fill_value=None): + """ + Get Addition of dataframe or series and other, element-wise (binary + operator `radd`). - @name.setter - def name(self, value): - self._data[value] = self._data.pop(self.name) + Equivalent to ``other + frame``, but with support to substitute a + fill_value for missing data in one of the inputs. With reverse + version, `add`. - @property - def ndim(self): - """Dimension of the data (always 1).""" - return 1 + Parameters + ---------- - @property - def shape(self): - """Returns a tuple representing the dimensionality of the Index. - """ - return (len(self),) + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. - def __iter__(self): - """ - Iterating over a GPU object is not effecient and hence not supported. + Returns + ------- + DataFrame or Series + Result of the arithmetic operation. - Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` - if you wish to iterate over the values. - """ - cudf.utils.utils.raise_iteration_error(obj=self) + Examples + -------- - def __bool__(self): - raise TypeError( - f"The truth value of a {type(self)} is ambiguous. Use " - "a.empty, a.bool(), a.item(), a.any() or a.all()." - ) + **DataFrame** - @property - def _num_columns(self): - return 1 + >>> import cudf + >>> df = cudf.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... 
index=['circle', 'triangle', 'rectangle']) + >>> df + 1 + angles degrees + circle 1 361 + triangle 4 181 + rectangle 5 361 + >>> df.radd(1) + angles degrees + circle 1 361 + triangle 4 181 + rectangle 5 361 - @property - def _column(self): - return self._data[self.name] + **Series** - @_column.setter - def _column(self, value): - self._data[self.name] = value + >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 2 + c 3 + d + dtype: int64 + >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b + d 1 + e + dtype: int64 + >>> a.add(b, fill_value=0) + a 2 + b 2 + c 3 + d 1 + e + dtype: int64 - @property - def values(self): - return self._column.values + """ - @property - def values_host(self): - return self._column.values_host + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") - def to_cupy( - self, - dtype: Union[Dtype, None] = None, - copy: bool = True, - na_value=None, - ) -> cupy.ndarray: - return super().to_cupy(dtype, copy, na_value).flatten() + return self._binaryop(other, "add", fill_value, reflect=True) - def to_numpy( - self, - dtype: Union[Dtype, None] = None, - copy: bool = True, - na_value=None, - ) -> np.ndarray: - return super().to_numpy(dtype, copy, na_value).flatten() + def subtract(self, other, axis, level=None, fill_value=None): + """ + Get Subtraction of dataframe or series and other, element-wise (binary + operator `sub`). - def tolist(self): + Equivalent to ``frame - other``, but with support to substitute a + fill_value for missing data in one of the inputs. With reverse + version, `rsub`. - raise TypeError( - "cuDF does not support conversion to host memory " - "via the `tolist()` method. Consider using " - "`.to_arrow().to_pylist()` to construct a Python list." - ) + Parameters + ---------- - to_list = tolist + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. - # TODO: When this method is removed we can also remove - # ColumnBase.to_gpu_array. - def to_gpu_array(self, fillna=None): - warnings.warn( - "The to_gpu_array method will be removed in a future cuDF " - "release. Consider using `to_cupy` instead.", - DeprecationWarning, - ) - return self._column.to_gpu_array(fillna=fillna) + Returns + ------- + DataFrame or Series + Result of the arithmetic operation. - @classmethod - def from_arrow(cls, array): - """Create from PyArrow Array/ChunkedArray. + Examples + -------- + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... 
index=['circle', 'triangle', 'rectangle']) + >>> df.sub(1) + angles degrees + circle -1 359 + triangle 2 179 + rectangle 3 359 + >>> df.sub([1, 2]) + angles degrees + circle -1 358 + triangle 2 178 + rectangle 3 358 + + **Series** + + >>> a = cudf.Series([10, 20, None, 30, None], index=['a', 'b', 'c', 'd', 'e']) + >>> a + a 10 + b 20 + c + d 30 + e + dtype: int64 + >>> b = cudf.Series([1, None, 2, 30], index=['a', 'c', 'b', 'd']) + >>> b + a 1 + c + b 2 + d 30 + dtype: int64 + >>> a.subtract(b, fill_value=2) + a 9 + b 18 + c + d 0 + e + dtype: int64 + + """ # noqa: E501 + + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") + + return self._binaryop(other, "sub", fill_value) + + sub = subtract + + def rsub(self, other, axis, level=None, fill_value=None): + """ + Get Subtraction of dataframe or series and other, element-wise (binary + operator `rsub`). + + Equivalent to ``other - frame``, but with support to substitute a + fill_value for missing data in one of the inputs. With reverse + version, `sub`. Parameters ---------- - array : PyArrow Array/ChunkedArray - PyArrow Object which has to be converted. - Raises - ------ - TypeError for invalid input type. + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. Returns ------- - SingleColumnFrame + DataFrame or Series + Result of the arithmetic operation. Examples -------- + + **DataFrame** + >>> import cudf - >>> import pyarrow as pa - >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) - StringIndex(['a' 'b' None], dtype='object') - >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) - 0 a - 1 b - 2 - dtype: object + >>> df = cudf.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df + angles degrees + circle 0 360 + triangle 3 180 + rectangle 4 360 + >>> df.rsub(1) + angles degrees + circle 1 -359 + triangle -2 -179 + rectangle -3 -359 + >>> df.rsub([1, 2]) + angles degrees + circle 1 -358 + triangle -2 -178 + rectangle -3 -358 + + **Series** + + >>> import cudf + >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 2 + c 3 + d + dtype: int64 + >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b + d 2 + e + dtype: int64 + >>> a.rsub(b, fill_value=10) + a 0 + b 8 + c 7 + d -8 + e + dtype: int64 """ - return cls(ColumnBase.from_arrow(array)) + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") - def to_arrow(self): + return self._binaryop(other, "sub", fill_value, reflect=True) + + def multiply(self, other, axis, level=None, fill_value=None): """ - Convert to a PyArrow Array. + Get Multiplication of dataframe or series and other, element-wise + (binary operator `mul`). + + Equivalent to ``frame * other``, but with support to substitute a + fill_value for missing data in one of the inputs. With reverse + version, `rmul`. + + Parameters + ---------- + + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. 
+ axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. Returns ------- - PyArrow Array + DataFrame or Series + Result of the arithmetic operation. Examples -------- + + **DataFrame** + >>> import cudf - >>> sr = cudf.Series(["a", "b", None]) - >>> sr.to_arrow() - - [ - "a", - "b", - null - ] - >>> ind = cudf.Index(["a", "b", None]) - >>> ind.to_arrow() - - [ - "a", - "b", - null - ] + >>> df = cudf.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> other = cudf.DataFrame({'angles': [0, 3, 4]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df * other + angles degrees + circle 0 + triangle 9 + rectangle 16 + >>> df.mul(other, fill_value=0) + angles degrees + circle 0 0 + triangle 9 0 + rectangle 16 0 + + **Series** + + >>> import cudf + >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 2 + c 3 + d + dtype: int64 + >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b + d 2 + e + dtype: int64 + >>> a.multiply(b, fill_value=0) + a 1 + b 0 + c 0 + d 0 + e + dtype: int64 + """ - return self._column.to_arrow() - @property - def is_unique(self): - """Return boolean if values in the object are unique. + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") + + return self._binaryop(other, "mul", fill_value) + + mul = multiply + + def rmul(self, other, axis, level=None, fill_value=None): + """ + Get Multiplication of dataframe or series and other, element-wise + (binary operator `rmul`). + + Equivalent to ``other * frame``, but with support to substitute a + fill_value for missing data in one of the inputs. With reverse + version, `mul`. + + Parameters + ---------- + + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. Returns ------- - bool + DataFrame or Series + Result of the arithmetic operation. + + Examples + -------- + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> other = cudf.DataFrame({'angles': [0, 3, 4]}, + ... 
index=['circle', 'triangle', 'rectangle']) + >>> other * df + angles degrees + circle 0 + triangle 9 + rectangle 16 + >>> df.rmul(other, fill_value=0) + angles degrees + circle 0 0 + triangle 9 0 + rectangle 16 0 + + **Series** + + >>> import cudf + >>> a = cudf.Series([10, 20, None, 30, 40], index=['a', 'b', 'c', 'd', 'e']) + >>> a + a 10 + b 20 + c + d 30 + e 40 + dtype: int64 + >>> b = cudf.Series([None, 1, 20, 5, 4], index=['a', 'b', 'd', 'e', 'f']) + >>> b + a + b 1 + d 20 + e 5 + f 4 + dtype: int64 + >>> a.rmul(b, fill_value=2) + a 20 + b 20 + c + d 600 + e 200 + f 8 + dtype: int64 + """ # noqa E501 + + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") + + return self._binaryop(other, "mul", fill_value, reflect=True) + + def mod(self, other, axis, level=None, fill_value=None): """ - return self._column.is_unique + Get Modulo division of dataframe or series and other, element-wise + (binary operator `mod`). - @property - def is_monotonic(self): - """Return boolean if values in the object are monotonically increasing. + Equivalent to ``frame % other``, but with support to substitute a + fill_value for missing data in one of the inputs. With reverse + version, `rmod`. - This property is an alias for :attr:`is_monotonic_increasing`. + Parameters + ---------- + + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. Returns ------- - bool + DataFrame or Series + Result of the arithmetic operation. + + Examples + -------- + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df % 100 + angles degrees + circle 0 60 + triangle 3 80 + rectangle 4 60 + >>> df.mod(100) + angles degrees + circle 0 60 + triangle 3 80 + rectangle 4 60 + + **Series** + + >>> import cudf + >>> series = cudf.Series([10, 20, 30]) + >>> series + 0 10 + 1 20 + 2 30 + dtype: int64 + >>> series.mod(4) + 0 2 + 1 0 + 2 2 + dtype: int64 + + + """ + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") + + return self._binaryop(other, "mod", fill_value) + + def rmod(self, other, axis, level=None, fill_value=None): """ - return self.is_monotonic_increasing + Get Modulo division of dataframe or series and other, element-wise + (binary operator `rmod`). - @property - def is_monotonic_increasing(self): - """Return boolean if values in the object are monotonically increasing. + Equivalent to ``other % frame``, but with support to substitute a + fill_value for missing data in one of the inputs. With reverse + version, `mod`. + + Parameters + ---------- + + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. 
If data in both corresponding DataFrame locations + is missing the result will be missing. Returns ------- - bool + DataFrame or Series + Result of the arithmetic operation. + + Examples + -------- + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'angles': [1, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> 100 % df + angles degrees + circle 0 100 + triangle 1 100 + rectangle 0 100 + >>> df.rmod(100) + angles degrees + circle 0 100 + triangle 1 100 + rectangle 0 100 + + **Series** + + >>> import cudf + >>> a = cudf.Series([10, 20, None, 30, 40], index=['a', 'b', 'c', 'd', 'e']) + >>> a + a 10 + b 20 + c + d 30 + e 40 + dtype: int64 + >>> b = cudf.Series([None, 1, 20, 5, 4], index=['a', 'b', 'd', 'e', 'f']) + >>> b + a + b 1 + d 20 + e 5 + f 4 + dtype: int64 + >>> a.rmod(b, fill_value=10) + a 0 + b 1 + c + d 20 + e 5 + f 4 + dtype: int64 + """ # noqa E501 + + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") + + return self._binaryop(other, "mod", fill_value, reflect=True) + + def pow(self, other, axis, level=None, fill_value=None): """ - return self._column.is_monotonic_increasing + Get Exponential power of dataframe series and other, element-wise + (binary operator `pow`). - @property - def is_monotonic_decreasing(self): - """Return boolean if values in the object are monotonically decreasing. + Equivalent to ``frame ** other``, but with support to substitute a + fill_value for missing data in one of the inputs. With reverse + version, `rpow`. + + Parameters + ---------- + + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. Returns ------- - bool + DataFrame or Series + Result of the arithmetic operation. + + Examples + -------- + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'angles': [1, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df ** 2 + angles degrees + circle 0 129600 + triangle 9 32400 + rectangle 16 129600 + >>> df.pow(2) + angles degrees + circle 0 129600 + triangle 9 32400 + rectangle 16 129600 + + **Series** + + >>> import cudf + >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 2 + c 3 + d + dtype: int64 + >>> b = cudf.Series([10, None, 12, None], index=['a', 'b', 'd', 'e']) + >>> b + a 10 + b + d 12 + e + dtype: int64 + >>> a.pow(b, fill_value=0) + a 1 + b 1 + c 1 + d 0 + e + dtype: int64 """ - return self._column.is_monotonic_decreasing - @property - def __cuda_array_interface__(self): - return self._column.__cuda_array_interface__ + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") + + return self._binaryop(other, "pow", fill_value) + + def rpow(self, other, axis, level=None, fill_value=None): + """ + Get Exponential power of dataframe or series and other, element-wise + (binary operator `pow`). 
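# Illustrative sketch, not part of the patch; the call pattern follows the
# docstrings above. All of the arithmetic methods being added here (add/radd,
# sub/rsub, mul/rmul, mod/rmod, pow/rpow, ...) share the same ``fill_value``
# semantics: a null on either side is filled before the operation, but if both
# sides are null the result stays null.
import cudf

a = cudf.Series([1, None, 3], index=["x", "y", "z"])
b = cudf.Series([10, 20, None], index=["x", "y", "z"])
a.pow(b, fill_value=1)   # x: 1**10, y: 1**20, z: 3**1 -> 1, 1, 3
a.rpow(b, fill_value=1)  # reflected form: computes b ** a instead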
- def factorize(self, na_sentinel=-1): - """Encode the input values as integer labels + Equivalent to ``other ** frame``, but with support to substitute a + fill_value for missing data in one of the inputs. With reverse + version, `pow`. Parameters ---------- - na_sentinel : number - Value to indicate missing category. + + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. Returns + ------- + DataFrame or Series + Result of the arithmetic operation. + + Examples -------- - (labels, cats) : (cupy.ndarray, cupy.ndarray or Index) - - *labels* contains the encoded values - - *cats* contains the categories in order that the N-th - item corresponds to the (N-1) code. + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'angles': [1, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> 1 ** df + angles degrees + circle 1 1 + triangle 1 1 + rectangle 1 1 + >>> df.rpow(1) + angles degrees + circle 1 1 + triangle 1 1 + rectangle 1 1 + + **Series** + + >>> import cudf + >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 2 + c 3 + d + dtype: int64 + >>> b = cudf.Series([10, None, 12, None], index=['a', 'b', 'd', 'e']) + >>> b + a 10 + b + d 12 + e + dtype: int64 + >>> a.rpow(b, fill_value=0) + a 10 + b 0 + c 0 + d 1 + e + dtype: int64 + """ + + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") + + return self._binaryop(other, "pow", fill_value, reflect=True) + + def floordiv(self, other, axis, level=None, fill_value=None): + """ + Get Integer division of dataframe or series and other, element-wise + (binary operator `floordiv`). + + Equivalent to ``frame // other``, but with support to substitute a + fill_value for missing data in one of the inputs. With reverse + version, `rfloordiv`. + + Parameters + ---------- + + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. + + Returns + ------- + DataFrame or Series + Result of the arithmetic operation. Examples -------- + + **DataFrame** + >>> import cudf - >>> s = cudf.Series(['a', 'a', 'c']) - >>> codes, uniques = s.factorize() - >>> codes - array([0, 0, 1], dtype=int8) - >>> uniques - StringIndex(['a' 'c'], dtype='object') + >>> df = cudf.DataFrame({'angles': [1, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... 
index=['circle', 'triangle', 'rectangle']) + >>> df.floordiv(2) + angles degrees + circle 0 180 + triangle 1 90 + rectangle 2 180 + >>> df // 2 + angles degrees + circle 0 180 + triangle 1 90 + rectangle 2 180 + + **Series** + + >>> import cudf + >>> a = cudf.Series([1, 1, 1, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 1 + c 1 + d + dtype: int64 + >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b + d 1 + e + dtype: int64 + >>> a.floordiv(b) + a 1 + b + c + d + e + dtype: int64 """ - return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) - def _make_operands_for_binop( - self, - other: T, - fill_value: Any = None, - reflect: bool = False, - *args, - **kwargs, - ) -> Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]]: - """Generate the dictionary of operands used for a binary operation. + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") + + return self._binaryop(other, "floordiv", fill_value) + + def rfloordiv(self, other, axis, level=None, fill_value=None): + """ + Get Integer division of dataframe or series and other, element-wise + (binary operator `rfloordiv`). + + Equivalent to ``other // dataframe``, but with support to substitute + a fill_value for missing data in one of the inputs. With reverse + version, `floordiv`. Parameters ---------- - other : SingleColumnFrame - The second operand. - fill_value : Any, default None - The value to replace null values with. If ``None``, nulls are not - filled before the operation. - reflect : bool, default False - If ``True``, swap the order of the operands. See - https://docs.python.org/3/reference/datamodel.html#object.__ror__ - for more information on when this is necessary. + + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. Returns ------- - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]] - The operands to be passed to _colwise_binop. - """ - # Get the appropriate name for output operations involving two objects - # that are Series-like objects. The output shares the lhs's name unless - # the rhs is a _differently_ named Series-like object. - if ( - isinstance(other, (SingleColumnFrame, pd.Series, pd.Index)) - and self.name != other.name - ): - result_name = None - else: - result_name = self.name + DataFrame or Series + Result of the arithmetic operation. + + Examples + -------- + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'col1': [10, 11, 23], + ... 
'col2': [101, 122, 321]}) + >>> df + col1 col2 + 0 10 101 + 1 11 122 + 2 23 321 + >>> df.rfloordiv(df) + col1 col2 + 0 1 1 + 1 1 1 + 2 1 1 + >>> df.rfloordiv(200) + col1 col2 + 0 20 1 + 1 18 1 + 2 8 0 + >>> df.rfloordiv(100) + col1 col2 + 0 10 0 + 1 9 0 + 2 4 0 + + **Series** + + >>> import cudf + >>> s = cudf.Series([1, 2, 10, 17]) + >>> s + 0 1 + 1 2 + 2 10 + 3 17 + dtype: int64 + >>> s.rfloordiv(100) + 0 100 + 1 50 + 2 10 + 3 5 + dtype: int64 + >>> s = cudf.Series([10, 20, None]) + >>> s + 0 10 + 1 20 + 2 + dtype: int64 + >>> s.rfloordiv(200) + 0 20 + 1 10 + 2 + dtype: int64 + >>> s.rfloordiv(200, fill_value=2) + 0 20 + 1 10 + 2 100 + dtype: int64 + """ + + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") + + return self._binaryop(other, "floordiv", fill_value, reflect=True) + + def truediv(self, other, axis, level=None, fill_value=None): + """ + Get Floating division of dataframe or series and other, element-wise + (binary operator `truediv`). + + Equivalent to ``frame / other``, but with support to substitute a + fill_value for missing data in one of the inputs. With reverse + version, `rtruediv`. + + Parameters + ---------- + + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. + axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. + + Returns + ------- + DataFrame or Series + Result of the arithmetic operation. + + Examples + -------- + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df.truediv(10) + angles degrees + circle 0.0 36.0 + triangle 0.3 18.0 + rectangle 0.4 36.0 + >>> df.div(10) + angles degrees + circle 0.0 36.0 + triangle 0.3 18.0 + rectangle 0.4 36.0 + >>> df / 10 + angles degrees + circle 0.0 36.0 + triangle 0.3 18.0 + rectangle 0.4 36.0 + + **Series** + + >>> import cudf + >>> a = cudf.Series([1, 10, 20, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 10 + c 20 + d + dtype: int64 + >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b + d 2 + e + dtype: int64 + >>> a.truediv(b, fill_value=0) + a 1.0 + b Inf + c Inf + d 0.0 + e + dtype: float64 + """ + + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") + + return self._binaryop(other, "truediv", fill_value) + + # Alias for truediv + div = truediv + divide = truediv + + def rtruediv(self, other, axis, level=None, fill_value=None): + """ + Get Floating division of dataframe or series and other, element-wise + (binary operator `rtruediv`). + + Equivalent to ``other / frame``, but with support to substitute a + fill_value for missing data in one of the inputs. With reverse + version, `truediv`. + + Parameters + ---------- + + other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. 
+ axis : int or string + Only ``0`` is supported for series, ``1`` or ``columns`` supported + for dataframe + fill_value : float or None, default None + Fill existing missing (NaN) values, and any new element needed + for successful DataFrame alignment, with this value before + computation. If data in both corresponding DataFrame locations + is missing the result will be missing. + + Returns + ------- + DataFrame or Series + Result of the arithmetic operation. + + Examples + -------- + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame({'angles': [0, 3, 4], + ... 'degrees': [360, 180, 360]}, + ... index=['circle', 'triangle', 'rectangle']) + >>> df + angles degrees + circle 0 360 + triangle 3 180 + rectangle 4 360 + >>> df.rtruediv(10) + angles degrees + circle inf 0.027778 + triangle 3.333333 0.055556 + rectangle 2.500000 0.027778 + >>> df.rdiv(10) + angles degrees + circle inf 0.027778 + triangle 3.333333 0.055556 + rectangle 2.500000 0.027778 + >>> 10 / df + angles degrees + circle inf 0.027778 + triangle 3.333333 0.055556 + rectangle 2.500000 0.027778 + + **Series** + + >>> import cudf + >>> a = cudf.Series([10, 20, None, 30], index=['a', 'b', 'c', 'd']) + >>> a + a 10 + b 20 + c + d 30 + dtype: int64 + >>> b = cudf.Series([1, None, 2, 3], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b + d 2 + e 3 + dtype: int64 + >>> a.rtruediv(b, fill_value=0) + a 0.1 + b 0.0 + c + d 0.066666667 + e Inf + dtype: float64 + """ + + if level is not None: + raise NotImplementedError("level parameter is not supported yet.") + + return self._binaryop(other, "truediv", fill_value, reflect=True) + + # Alias for rtruediv + rdiv = rtruediv + + def eq(self, other, axis="columns", level=None, fill_value=None): + """Equal to, element-wise (binary operator eq). + + Parameters + ---------- + other : Series or scalar value + fill_value : None or value + Value to fill nulls with before computation. If data in both + corresponding Series locations is null the result will be null + + Returns + ------- + Frame + The result of the operation. + + Examples + -------- + **DataFrame** + + >>> left = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'c': [7, 8, 9]} + ... ) + >>> right = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'd': [10, 12, 12]} + ... ) + >>> left.eq(right) + a b c d + 0 True True + 1 True True + 2 True True + >>> left.eq(right, fill_value=7) + a b c d + 0 True True True False + 1 True True False False + 2 True True False False + + **Series** + + >>> a = cudf.Series([1, 2, 3, None, 10, 20], + ... index=['a', 'c', 'd', 'e', 'f', 'g']) + >>> a + a 1 + c 2 + d 3 + e + f 10 + g 20 + dtype: int64 + >>> b = cudf.Series([-10, 23, -1, None, None], + ... index=['a', 'b', 'c', 'd', 'e']) + >>> b + a -10 + b 23 + c -1 + d + e + dtype: int64 + >>> a.eq(b, fill_value=2) + a False + b False + c False + d False + e + f False + g False + dtype: bool + """ + return self._binaryop( + other=other, fn="eq", fill_value=fill_value, can_reindex=True + ) + + def ne(self, other, axis="columns", level=None, fill_value=None): + """Not equal to, element-wise (binary operator ne). + + Parameters + ---------- + other : Series or scalar value + fill_value : None or value + Value to fill nulls with before computation. If data in both + corresponding Series locations is null the result will be null + + Returns + ------- + Frame + The result of the operation. + + Examples + -------- + **DataFrame** + + >>> left = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 
'c': [7, 8, 9]} + ... ) + >>> right = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'd': [10, 12, 12]} + ... ) + >>> left.ne(right) + a b c d + 0 False False + 1 False False + 2 False False + >>> left.ne(right, fill_value=7) + a b c d + 0 False False False True + 1 False False True True + 2 False False True True + + **Series** + + >>> a = cudf.Series([1, 2, 3, None, 10, 20], + ... index=['a', 'c', 'd', 'e', 'f', 'g']) + >>> a + a 1 + c 2 + d 3 + e + f 10 + g 20 + dtype: int64 + >>> b = cudf.Series([-10, 23, -1, None, None], + ... index=['a', 'b', 'c', 'd', 'e']) + >>> b + a -10 + b 23 + c -1 + d + e + dtype: int64 + >>> a.ne(b, fill_value=2) + a True + b True + c True + d True + e + f True + g True + dtype: bool + """ # noqa: E501 + return self._binaryop( + other=other, fn="ne", fill_value=fill_value, can_reindex=True + ) + + def lt(self, other, axis="columns", level=None, fill_value=None): + """Less than, element-wise (binary operator lt). + + Parameters + ---------- + other : Series or scalar value + fill_value : None or value + Value to fill nulls with before computation. If data in both + corresponding Series locations is null the result will be null + + Returns + ------- + Frame + The result of the operation. + + Examples + -------- + **DataFrame** + + >>> left = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'c': [7, 8, 9]} + ... ) + >>> right = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'd': [10, 12, 12]} + ... ) + >>> left.lt(right) + a b c d + 0 False False + 1 False False + 2 False False + >>> left.lt(right, fill_value=7) + a b c d + 0 False False False True + 1 False False False True + 2 False False False True - # This needs to be tested correctly - if isinstance(other, SingleColumnFrame): - other = other._column - elif not _is_scalar_or_zero_d_array(other): - # Non-scalar right operands are valid iff they convert to columns. - try: - other = as_column(other) - except Exception: - return NotImplemented + **Series** - return {result_name: (self._column, other, reflect, fill_value)} + >>> a = cudf.Series([1, 2, 3, None, 10, 20], + ... index=['a', 'c', 'd', 'e', 'f', 'g']) + >>> a + a 1 + c 2 + d 3 + e + f 10 + g 20 + dtype: int64 + >>> b = cudf.Series([-10, 23, -1, None, None], + ... index=['a', 'b', 'c', 'd', 'e']) + >>> b + a -10 + b 23 + c -1 + d + e + dtype: int64 + >>> a.lt(b, fill_value=-10) + a False + b True + c False + d False + e + f False + g False + dtype: bool + """ # noqa: E501 + return self._binaryop( + other=other, fn="lt", fill_value=fill_value, can_reindex=True + ) + + def le(self, other, axis="columns", level=None, fill_value=None): + """Less than or equal, element-wise (binary operator le). + + Parameters + ---------- + other : Series or scalar value + fill_value : None or value + Value to fill nulls with before computation. If data in both + corresponding Series locations is null the result will be null + + Returns + ------- + Frame + The result of the operation. + + Examples + -------- + **DataFrame** + + >>> left = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'c': [7, 8, 9]} + ... ) + >>> right = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'd': [10, 12, 12]} + ... ) + >>> left.le(right) + a b c d + 0 True True + 1 True True + 2 True True + >>> left.le(right, fill_value=7) + a b c d + 0 True True True True + 1 True True False True + 2 True True False True + + **Series** + + >>> a = cudf.Series([1, 2, 3, None, 10, 20], + ... 
index=['a', 'c', 'd', 'e', 'f', 'g']) + >>> a + a 1 + c 2 + d 3 + e + f 10 + g 20 + dtype: int64 + >>> b = cudf.Series([-10, 23, -1, None, None], + ... index=['a', 'b', 'c', 'd', 'e']) + >>> b + a -10 + b 23 + c -1 + d + e + dtype: int64 + >>> a.le(b, fill_value=-10) + a False + b True + c False + d False + e + f False + g False + dtype: bool + """ # noqa: E501 + return self._binaryop( + other=other, fn="le", fill_value=fill_value, can_reindex=True + ) + + def gt(self, other, axis="columns", level=None, fill_value=None): + """Greater than, element-wise (binary operator gt). + + Parameters + ---------- + other : Series or scalar value + fill_value : None or value + Value to fill nulls with before computation. If data in both + corresponding Series locations is null the result will be null + + Returns + ------- + Frame + The result of the operation. + + Examples + -------- + **DataFrame** + + >>> left = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'c': [7, 8, 9]} + ... ) + >>> right = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'd': [10, 12, 12]} + ... ) + >>> left.gt(right) + a b c d + 0 False False + 1 False False + 2 False False + >>> left.gt(right, fill_value=7) + a b c d + 0 False False False False + 1 False False True False + 2 False False True False + + **Series** + + >>> a = cudf.Series([1, 2, 3, None, 10, 20], + ... index=['a', 'c', 'd', 'e', 'f', 'g']) + >>> a + a 1 + c 2 + d 3 + e + f 10 + g 20 + dtype: int64 + >>> b = cudf.Series([-10, 23, -1, None, None], + ... index=['a', 'b', 'c', 'd', 'e']) + >>> b + a -10 + b 23 + c -1 + d + e + dtype: int64 + >>> a.gt(b) + a True + b False + c True + d False + e False + f False + g False + dtype: bool + """ # noqa: E501 + return self._binaryop( + other=other, fn="gt", fill_value=fill_value, can_reindex=True + ) + + def ge(self, other, axis="columns", level=None, fill_value=None): + """Greater than or equal, element-wise (binary operator ge). + + Parameters + ---------- + other : Series or scalar value + fill_value : None or value + Value to fill nulls with before computation. If data in both + corresponding Series locations is null the result will be null + + Returns + ------- + Frame + The result of the operation. + + Examples + -------- + **DataFrame** + + >>> left = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'c': [7, 8, 9]} + ... ) + >>> right = cudf.DataFrame({ + ... 'a': [1, 2, 3], + ... 'b': [4, 5, 6], + ... 'd': [10, 12, 12]} + ... ) + >>> left.ge(right) + a b c d + 0 True True + 1 True True + 2 True True + >>> left.ge(right, fill_value=7) + a b c d + 0 True True True False + 1 True True True False + 2 True True True False + + **Series** + + >>> a = cudf.Series([1, 2, 3, None, 10, 20], + ... index=['a', 'c', 'd', 'e', 'f', 'g']) + >>> a + a 1 + c 2 + d 3 + e + f 10 + g 20 + dtype: int64 + >>> b = cudf.Series([-10, 23, -1, None, None], + ... 
index=['a', 'b', 'c', 'd', 'e']) + >>> b + a -10 + b 23 + c -1 + d + e + dtype: int64 + >>> a.ge(b) + a True + b False + c True + d False + e False + f False + g False + dtype: bool + """ # noqa: E501 + return self._binaryop( + other=other, fn="ge", fill_value=fill_value, can_reindex=True + ) def _get_replacement_values_for_columns( @@ -5739,7 +7023,7 @@ def _drop_rows_by_labels( if isinstance(level, int) and level >= obj.index.nlevels: raise ValueError("Param level out of bounds.") - if not isinstance(labels, SingleColumnFrame): + if not isinstance(labels, cudf.core.single_column_frame.SingleColumnFrame): labels = as_column(labels) if isinstance(obj._index, cudf.MultiIndex): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3be71cf17a8..6ffba8da069 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -820,7 +820,7 @@ def quantile(self, q=0.5, interpolation="linear"): interpolation : {"linear", "lower", "higher", "midpoint", "nearest"} The interpolation method to use when the desired quantile lies between two data points. Defaults to "linear". - """ + """ def func(x): return getattr(x, "quantile")(q=q, interpolation=interpolation) @@ -860,9 +860,40 @@ def last(self): """Get the last non-null value in each group.""" return self.agg("last") - def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: - """Internal implementation for `ffill` and `bfill` + def diff(self, periods=1, axis=0): + """Get the difference between the values in each group. + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, + accepts negative values. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Take difference over rows (0) or columns (1). + Only row-wise (0) shift is supported. + + Returns + ------- + Series or DataFrame + First differences of the Series or DataFrame. 
""" + + if not axis == 0: + raise NotImplementedError("Only axis=0 is supported.") + + # grouped values + value_columns = self.grouping.values + _, (data, index), _ = self._groupby.groups( + cudf.core.frame.Frame(value_columns._data) + ) + grouped = self.obj.__class__._from_data(data, index) + grouped = self._mimic_pandas_order(grouped) + + result = grouped - self.shift(periods=periods) + return result._copy_type_metadata(value_columns) + + def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: + """Internal implementation for `ffill` and `bfill`""" value_columns = self.grouping.values result = self.obj.__class__._from_data( self._groupby.replace_nulls( @@ -1333,8 +1364,7 @@ def _handle_by_or_level(self, by=None, level=None): @property def keys(self): - """Return grouping key columns as index - """ + """Return grouping key columns as index""" nkeys = len(self._key_columns) if nkeys == 0: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7adb01a03bf..de463269743 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -50,7 +50,8 @@ from cudf.core.column.column import as_column, concat_columns from cudf.core.column.string import StringMethods as StringMethods from cudf.core.dtypes import IntervalDtype -from cudf.core.frame import Frame, SingleColumnFrame +from cudf.core.frame import Frame +from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type from cudf.utils.utils import cached_property, search_range @@ -145,6 +146,8 @@ class RangeIndex(BaseIndex): RangeIndex(start=1, stop=10, step=1, name='a') """ + _range: range + def __init__( self, start, stop=None, step=1, dtype=None, copy=False, name=None ): @@ -163,6 +166,10 @@ def __init__( self._step = int(step) if step is not None else 1 self._index = None self._name = name + self._range = range(self._start, self._stop, self._step) + # _end is the actual last element of RangeIndex, + # whereas _stop is an upper bound. + self._end = self._start + self._step * (len(self._range) - 1) def _copy_type_metadata( self, other: Frame, include_index: bool = True @@ -216,6 +223,27 @@ def _values(self): else: return column.column_empty(0, masked=False, dtype=self.dtype) + def is_numeric(self): + return True + + def is_boolean(self): + return False + + def is_integer(self): + return True + + def is_floating(self): + return False + + def is_object(self): + return False + + def is_categorical(self): + return False + + def is_interval(self): + return False + @property def _data(self): return cudf.core.column_accessor.ColumnAccessor( @@ -312,7 +340,7 @@ def equals(self, other): other._step, ): return True - return cudf.Int64Index._from_data(self._data).equals(other) + return Int64Index._from_data(self._data).equals(other) def serialize(self): header = {} @@ -476,7 +504,7 @@ def __rmul__(self, other): def _as_int64(self): # Convert self to an Int64Index. This method is used to perform ops # that are not defined directly on RangeIndex. 
- return cudf.Int64Index._from_data(self._data) + return Int64Index._from_data(self._data) def __getattr__(self, key): # For methods that are not defined for RangeIndex we attempt to operate @@ -521,6 +549,125 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) return np.clip(round_method(idx), 0, idx_int_upper_bound, dtype=int) + def _union(self, other, sort=None): + if isinstance(other, RangeIndex): + # Variable suffixes are of the + # following notation: *_o -> other, *_s -> self, + # and *_r -> result + start_s, step_s = self.start, self.step + end_s = self._end + start_o, step_o = other.start, other.step + end_o = other._end + if self.step < 0: + start_s, step_s, end_s = end_s, -step_s, start_s + if other.step < 0: + start_o, step_o, end_o = end_o, -step_o, start_o + if len(self) == 1 and len(other) == 1: + step_s = step_o = abs(self.start - other.start) + elif len(self) == 1: + step_s = step_o + elif len(other) == 1: + step_o = step_s + + # Determine minimum start value of the result. + start_r = min(start_s, start_o) + # Determine maximum end value of the result. + end_r = max(end_s, end_o) + result = None + min_step = min(step_o, step_s) + + if ((start_s - start_o) % min_step) == 0: + # Checking to determine other is a subset of self with + # equal step size. + if ( + step_o == step_s + and (start_s - end_o) <= step_s + and (start_o - end_s) <= step_s + ): + result = type(self)(start_r, end_r + step_s, step_s) + # Checking if self is a subset of other with unequal + # step sizes. + elif ( + step_o % step_s == 0 + and (start_o + step_s >= start_s) + and (end_o - step_s <= end_s) + ): + result = type(self)(start_r, end_r + step_s, step_s) + # Checking if other is a subset of self with unequal + # step sizes. + elif ( + step_s % step_o == 0 + and (start_s + step_o >= start_o) + and (end_s - step_o <= end_o) + ): + result = type(self)(start_r, end_r + step_o, step_o) + # Checking to determine when the steps are even but one of + # the inputs spans across is near half or less then half + # the other input. This case needs manipulation to step + # size. + elif ( + step_o == step_s + and (step_s % 2 == 0) + and (abs(start_s - start_o) <= step_s / 2) + and (abs(end_s - end_o) <= step_s / 2) + ): + result = type(self)(start_r, end_r + step_s / 2, step_s / 2) + if result is not None: + if sort is None and not result.is_monotonic_increasing: + return result.sort_values() + else: + return result + + # If all the above optimizations don't cater to the inpputs, + # we materialize RangeIndex's into `Int64Index` and + # then perform `union`. 
+ return Int64Index(self._values)._union(other, sort=sort) + + def _intersection(self, other, sort=False): + if not isinstance(other, RangeIndex): + return super()._intersection(other, sort=sort) + + if not len(self) or not len(other): + return RangeIndex(0) + + first = self._range[::-1] if self.step < 0 else self._range + second = other._range[::-1] if other.step < 0 else other._range + + # check whether intervals intersect + # deals with in- and decreasing ranges + int_low = max(first.start, second.start) + int_high = min(first.stop, second.stop) + if int_high <= int_low: + return RangeIndex(0) + + # Method hint: linear Diophantine equation + # solve intersection problem + # performance hint: for identical step sizes, could use + # cheaper alternative + gcd, s, _ = _extended_gcd(first.step, second.step) + + # check whether element sets intersect + if (first.start - second.start) % gcd: + return RangeIndex(0) + + # calculate parameters for the RangeIndex describing the + # intersection disregarding the lower bounds + tmp_start = ( + first.start + (second.start - first.start) * first.step // gcd * s + ) + new_step = first.step * second.step // gcd + no_steps = -(-(int_low - tmp_start) // abs(new_step)) + new_start = tmp_start + abs(new_step) * no_steps + new_range = range(new_start, int_high, new_step) + new_index = RangeIndex(new_range) + + if (self.step < 0 and other.step < 0) is not (new_index.step < 0): + new_index = new_index[::-1] + if sort is None: + new_index = new_index.sort_values() + + return new_index + # Patch in all binops and unary ops, which bypass __getattr__ on the instance # and prevent the above overload from working. @@ -994,6 +1141,65 @@ def find_label_range(self, first, last): def get_slice_bound(self, label, side, kind=None): return self._values.get_slice_bound(label, side, kind) + def is_numeric(self): + return False + + def is_boolean(self): + return True + + def is_integer(self): + return False + + def is_floating(self): + return False + + def is_object(self): + return False + + def is_categorical(self): + return False + + def is_interval(self): + return False + + def argsort( + self, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + """Return the integer indices that would sort the Series values. + + Parameters + ---------- + axis : {0 or "index"} + Has no effect but is accepted for compatibility with numpy. + kind : {'mergesort', 'quicksort', 'heapsort', 'stable'}, default 'quicksort' + Choice of sorting algorithm. See :func:`numpy.sort` for more + information. 'mergesort' and 'stable' are the only stable + algorithms. Only quicksort is supported in cuDF. + order : None + Has no effect but is accepted for compatibility with numpy. + ascending : bool or list of bool, default True + If True, sort values in ascending order, otherwise descending. + na_position : {‘first’ or ‘last’}, default ‘last’ + Argument ‘first’ puts NaNs at the beginning, ‘last’ puts NaNs + at the end. + + Returns + ------- + cupy.ndarray: The indices sorted based on input. + """ # noqa: E501 + return super().argsort( + axis=axis, + kind=kind, + order=order, + ascending=ascending, + na_position=na_position, + ) + class NumericIndex(GenericIndex): """Immutable, ordered and sliceable sequence of labels. 
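# Illustrative sketch, not part of the patch; shown via the private helpers
# defined above (the public union/intersection calls are assumed to route
# through them). A worked example of the two RangeIndex fast paths:
import cudf

a = cudf.RangeIndex(0, 10, 2)    # 0, 2, 4, 6, 8
b = cudf.RangeIndex(10, 20, 2)   # 10, 12, ..., 18
a._union(b)                      # equal steps, adjacent ranges ->
                                 # RangeIndex(start=0, stop=20, step=2)

c = cudf.RangeIndex(0, 20, 4)    # 0, 4, 8, 12, 16
d = cudf.RangeIndex(2, 20, 6)    # 2, 8, 14
# gcd(4, 6) = 2 divides (0 - 2), so the element sets intersect; the Bezout
# coefficients from _extended_gcd give the first common element 8 and the
# combined step lcm(4, 6) = 12.
c._intersection(d)               # RangeIndex containing just [8]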
@@ -1029,6 +1235,27 @@ def __init__(self, data=None, dtype=None, copy=False, name=None): super().__init__(data, **kwargs) + def is_numeric(self): + return True + + def is_boolean(self): + return False + + def is_integer(self): + return True + + def is_floating(self): + return False + + def is_object(self): + return False + + def is_categorical(self): + return False + + def is_interval(self): + return False + class Int8Index(NumericIndex): """ @@ -1254,6 +1481,12 @@ class Float32Index(NumericIndex): _dtype = np.float32 + def is_integer(self): + return False + + def is_floating(self): + return True + class Float64Index(NumericIndex): """ @@ -1279,6 +1512,12 @@ class Float64Index(NumericIndex): _dtype = np.float64 + def is_integer(self): + return False + + def is_floating(self): + return True + class DatetimeIndex(GenericIndex): """ @@ -1654,6 +1893,9 @@ def _get_dt_field(self, field): ) return as_index(out_column, name=self.name) + def is_boolean(self): + return False + class TimedeltaIndex(GenericIndex): """ @@ -1782,6 +2024,9 @@ def inferred_freq(self): """ raise NotImplementedError("inferred_freq is not yet supported") + def is_boolean(self): + return False + class CategoricalIndex(GenericIndex): """ @@ -1894,6 +2139,12 @@ def categories(self): """ return as_index(self._values.categories) + def is_boolean(self): + return False + + def is_categorical(self): + return True + def interval_range( start=None, end=None, periods=None, freq=None, name=None, closed="right", @@ -2025,7 +2276,7 @@ def interval_range( if len(right_col) == 0 or len(left_col) == 0: dtype = IntervalDtype("int64", closed) data = column.column_empty_like_same_mask(left_col, dtype) - return cudf.IntervalIndex(data, closed=closed) + return IntervalIndex(data, closed=closed) interval_col = column.build_interval_column( left_col, right_col, closed=closed @@ -2122,6 +2373,12 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): return IntervalIndex(interval_col, name=name) + def is_interval(self): + return True + + def is_boolean(self): + return False + class StringIndex(GenericIndex): """String defined indices into another Column @@ -2152,9 +2409,6 @@ def to_pandas(self): self.to_numpy(na_value=None), name=self.name, dtype="object" ) - def take(self, indices): - return self._values[indices] - def __repr__(self): return ( f"{self.__class__.__name__}({self._values.to_array()}," @@ -2182,6 +2436,12 @@ def _clean_nulls_from_index(self): else: return self + def is_boolean(self): + return False + + def is_object(self): + return True + def as_index(arbitrary, **kwargs) -> BaseIndex: """Create an Index from an arbitrary object @@ -2370,3 +2630,21 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: stop = non_empty_indexes[-1].stop if next_ is None else next_ return RangeIndex(start, stop, step) + + +def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: + """ + Extended Euclidean algorithms to solve Bezout's identity: + a*x + b*y = gcd(x, y) + Finds one particular solution for x, y: s, t + Returns: gcd, s, t + """ + s, old_s = 0, 1 + t, old_t = 1, 0 + r, old_r = b, a + while r: + quotient = old_r // r + old_r, r = r, old_r - quotient * r + old_s, s = s, old_s - quotient * s + old_t, t = t, old_t - quotient * t + return old_r, old_s, old_t diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py new file mode 100644 index 00000000000..68088cb275e --- /dev/null +++ b/python/cudf/cudf/core/indexed_frame.py @@ -0,0 +1,528 @@ +# Copyright (c) 2021, NVIDIA 
CORPORATION. +"""Base class for Frame types that have an index.""" + +from __future__ import annotations + +import warnings +from typing import Type, TypeVar + +import cupy as cp +import pandas as pd +from nvtx import annotate + +import cudf +from cudf.api.types import is_categorical_dtype, is_list_like +from cudf.core.frame import Frame +from cudf.core.index import Index +from cudf.core.multiindex import MultiIndex +from cudf.utils.utils import cached_property + + +def _indices_from_labels(obj, labels): + from cudf.core.column import column + + if not isinstance(labels, cudf.MultiIndex): + labels = column.as_column(labels) + + if is_categorical_dtype(obj.index): + labels = labels.astype("category") + codes = labels.codes.astype(obj.index._values.codes.dtype) + labels = column.build_categorical_column( + categories=labels.dtype.categories, + codes=codes, + ordered=labels.dtype.ordered, + ) + else: + labels = labels.astype(obj.index.dtype) + + # join is not guaranteed to maintain the index ordering + # so we will sort it with its initial ordering which is stored + # in column "__" + lhs = cudf.DataFrame({"__": column.arange(len(labels))}, index=labels) + rhs = cudf.DataFrame({"_": column.arange(len(obj))}, index=obj.index) + return lhs.join(rhs).sort_values("__")["_"] + + +def _get_label_range_or_mask(index, start, stop, step): + if ( + not (start is None and stop is None) + and type(index) is cudf.core.index.DatetimeIndex + and index.is_monotonic is False + ): + start = pd.to_datetime(start) + stop = pd.to_datetime(stop) + if start is not None and stop is not None: + if start > stop: + return slice(0, 0, None) + # TODO: Once Index binary ops are updated to support logical_and, + # can use that instead of using cupy. + boolean_mask = cp.logical_and((index >= start), (index <= stop)) + elif start is not None: + boolean_mask = index >= start + else: + boolean_mask = index <= stop + return boolean_mask + else: + start, stop = index.find_label_range(start, stop) + return slice(start, stop, step) + + +class _FrameIndexer: + """Parent class for indexers.""" + + def __init__(self, frame): + self._frame = frame + + +_LocIndexerClass = TypeVar("_LocIndexerClass", bound="_FrameIndexer") +_IlocIndexerClass = TypeVar("_IlocIndexerClass", bound="_FrameIndexer") + + +class IndexedFrame(Frame): + """A frame containing an index. + + This class encodes the common behaviors for core user-facing classes like + DataFrame and Series that consist of a sequence of columns along with a + special set of index columns. + + Parameters + ---------- + data : dict + An dict mapping column names to Columns + index : Table + A Frame representing the (optional) index columns. + """ + + # mypy can't handle bound type variables as class members + _loc_indexer_type: Type[_LocIndexerClass] # type: ignore + _iloc_indexer_type: Type[_IlocIndexerClass] # type: ignore + + def __init__(self, data=None, index=None): + super().__init__(data=data, index=index) + + def to_dict(self, *args, **kwargs): # noqa: D102 + raise TypeError( + "cuDF does not support conversion to host memory " + "via `to_dict()` method. Consider using " + "`.to_pandas().to_dict()` to construct a Python dictionary." + ) + + @property + def index(self): + """Get the labels for the rows.""" + return self._index + + @index.setter + def index(self, value): + old_length = len(self) + new_length = len(value) + + # A DataFrame with 0 columns can have an index of arbitrary length. 
+ if len(self._data) > 0 and new_length != old_length: + raise ValueError( + f"Length mismatch: Expected axis has {old_length} elements, " + f"new values have {len(value)} elements" + ) + self._index = Index(value) + + @cached_property + def loc(self): + """Select rows and columns by label or boolean mask. + + Examples + -------- + **Series** + + >>> import cudf + >>> series = cudf.Series([10, 11, 12], index=['a', 'b', 'c']) + >>> series + a 10 + b 11 + c 12 + dtype: int64 + >>> series.loc['b'] + 11 + + **DataFrame** + + DataFrame with string index. + + >>> df + a b + a 0 5 + b 1 6 + c 2 7 + d 3 8 + e 4 9 + + Select a single row by label. + + >>> df.loc['a'] + a 0 + b 5 + Name: a, dtype: int64 + + Select multiple rows and a single column. + + >>> df.loc[['a', 'c', 'e'], 'b'] + a 5 + c 7 + e 9 + Name: b, dtype: int64 + + Selection by boolean mask. + + >>> df.loc[df.a > 2] + a b + d 3 8 + e 4 9 + + Setting values using loc. + + >>> df.loc[['a', 'c', 'e'], 'a'] = 0 + >>> df + a b + a 0 5 + b 1 6 + c 0 7 + d 3 8 + e 0 9 + + """ + return self._loc_indexer_type(self) + + @cached_property + def iloc(self): + """Select values by position. + + Examples + -------- + **Series** + + >>> import cudf + >>> s = cudf.Series([10, 20, 30]) + >>> s + 0 10 + 1 20 + 2 30 + dtype: int64 + >>> s.iloc[2] + 30 + + **DataFrame** + + Selecting rows and column by position. + + Examples + -------- + >>> df = cudf.DataFrame({'a': range(20), + ... 'b': range(20), + ... 'c': range(20)}) + + Select a single row using an integer index. + + >>> df.iloc[1] + a 1 + b 1 + c 1 + Name: 1, dtype: int64 + + Select multiple rows using a list of integers. + + >>> df.iloc[[0, 2, 9, 18]] + a b c + 0 0 0 0 + 2 2 2 2 + 9 9 9 9 + 18 18 18 18 + + Select rows using a slice. + + >>> df.iloc[3:10:2] + a b c + 3 3 3 3 + 5 5 5 5 + 7 7 7 7 + 9 9 9 9 + + Select both rows and columns. + + >>> df.iloc[[1, 3, 5, 7], 2] + 1 1 + 3 3 + 5 5 + 7 7 + Name: c, dtype: int64 + + Setting values in a column using iloc. + + >>> df.iloc[:4] = 0 + >>> df + a b c + 0 0 0 0 + 1 0 0 0 + 2 0 0 0 + 3 0 0 0 + 4 4 4 4 + 5 5 5 5 + 6 6 6 6 + 7 7 7 7 + 8 8 8 8 + 9 9 9 9 + [10 more rows] + + """ + return self._iloc_indexer_type(self) + + @annotate("SORT_INDEX", color="red", domain="cudf_python") + def sort_index( + self, + axis=0, + level=None, + ascending=True, + inplace=False, + kind=None, + na_position="last", + sort_remaining=True, + ignore_index=False, + key=None, + ): + """Sort object by labels (along an axis). + + Parameters + ---------- + axis : {0 or ‘index’, 1 or ‘columns’}, default 0 + The axis along which to sort. The value 0 identifies the rows, + and 1 identifies the columns. + level : int or level name or list of ints or list of level names + If not None, sort on values in specified index level(s). + This is only useful in the case of MultiIndex. + ascending : bool, default True + Sort ascending vs. descending. + inplace : bool, default False + If True, perform operation in-place. + kind : sorting method such as `quick sort` and others. + Not yet supported. + na_position : {‘first’, ‘last’}, default ‘last’ + Puts NaNs at the beginning if first; last puts NaNs at the end. + sort_remaining : bool, default True + Not yet supported + ignore_index : bool, default False + if True, index will be replaced with RangeIndex. + key : callable, optional + If not None, apply the key function to the index values before + sorting. 
This is similar to the key argument in the builtin + sorted() function, with the notable difference that this key + function should be vectorized. It should expect an Index and return + an Index of the same shape. For MultiIndex inputs, the key is + applied per level. + + Returns + ------- + Frame or None + + Notes + ----- + Difference from pandas: + * Not supporting: kind, sort_remaining=False + + Examples + -------- + **Series** + >>> import cudf + >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) + >>> series + 3 a + 2 b + 1 c + 4 d + dtype: object + >>> series.sort_index() + 1 c + 2 b + 3 a + 4 d + dtype: object + + Sort Descending + + >>> series.sort_index(ascending=False) + 4 d + 3 a + 2 b + 1 c + dtype: object + + **DataFrame** + >>> df = cudf.DataFrame( + ... {"b":[3, 2, 1], "a":[2, 1, 3]}, index=[1, 3, 2]) + >>> df.sort_index(axis=0) + b a + 1 3 2 + 2 1 3 + 3 2 1 + >>> df.sort_index(axis=1) + a b + 1 2 3 + 3 1 2 + 2 3 1 + """ + if kind is not None: + raise NotImplementedError("kind is not yet supported") + + if not sort_remaining: + raise NotImplementedError( + "sort_remaining == False is not yet supported" + ) + + if key is not None: + raise NotImplementedError("key is not yet supported.") + + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + + if axis in (0, "index"): + idx = self.index + if isinstance(idx, MultiIndex): + if level is not None: + # Pandas doesn't handle na_position in case of MultiIndex. + na_position = "first" if ascending is True else "last" + labels = [ + idx._get_level_label(lvl) + for lvl in (level if is_list_like(level) else (level,)) + ] + # Explicitly construct a Frame rather than using type(self) + # to avoid constructing a SingleColumnFrame (e.g. Series). + idx = Frame._from_data(idx._data.select_by_label(labels)) + + inds = idx._get_sorted_inds( + ascending=ascending, na_position=na_position + ) + # TODO: This line is abusing the fact that take accepts a + # column, not just user-facing objects. We will want to + # refactor that in the future. + out = self.take(inds) + elif (ascending and idx.is_monotonic_increasing) or ( + not ascending and idx.is_monotonic_decreasing + ): + out = self.copy() + else: + inds = idx.argsort( + ascending=ascending, na_position=na_position + ) + out = self.take(inds) + else: + labels = sorted(self._data.names, reverse=not ascending) + out = self[labels] + + if ignore_index is True: + out = out.reset_index(drop=True) + return self._mimic_inplace(out, inplace=inplace) + + def sort_values( + self, + by, + axis=0, + ascending=True, + inplace=False, + kind="quicksort", + na_position="last", + ignore_index=False, + ): + """Sort by the values along either axis. + + Parameters + ---------- + by : str or list of str + Name or list of names to sort by. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of the + by. + na_position : {‘first’, ‘last’}, default ‘last’ + 'first' puts nulls at the beginning, 'last' puts nulls at the end + ignore_index : bool, default False + If True, index will not be sorted. + + Returns + ------- + Frame : Frame with sorted values. + + Notes + ----- + Difference from pandas: + * Support axis='index' only. 
+ * Not supporting: inplace, kind + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['a'] = [0, 1, 2] + >>> df['b'] = [-3, 2, 0] + >>> df.sort_values('b') + a b + 0 0 -3 + 2 2 0 + 1 1 2 + """ + if na_position not in {"first", "last"}: + raise ValueError(f"invalid na_position: {na_position}") + if inplace: + raise NotImplementedError("`inplace` not currently implemented.") + if kind != "quicksort": + if kind not in {"mergesort", "heapsort", "stable"}: + raise AttributeError( + f"{kind} is not a valid sorting algorithm for " + f"'DataFrame' object" + ) + warnings.warn( + f"GPU-accelerated {kind} is currently not supported, " + f"defaulting to quicksort." + ) + if axis != 0: + raise NotImplementedError("`axis` not currently implemented.") + + if len(self) == 0: + return self + + # argsort the `by` column + return self.take( + self._get_columns_by_label(by)._get_sorted_inds( + ascending=ascending, na_position=na_position + ), + keep_index=not ignore_index, + ) + + def _n_largest_or_smallest(self, largest, n, columns, keep): + # Get column to operate on + if isinstance(columns, str): + columns = [columns] + + if len(self) == 0: + return self + + if keep == "first": + if n < 0: + n = 0 + + # argsort the `by` column + return self.take( + self._get_columns_by_label(columns)._get_sorted_inds( + ascending=not largest + )[:n], + keep_index=True, + ) + elif keep == "last": + indices = self._get_columns_by_label(columns)._get_sorted_inds( + ascending=largest + ) + + if n <= 0: + # Empty slice. + indices = indices[0:0] + else: + indices = indices[: -n - 1 : -1] + return self.take(indices, keep_index=True) + else: + raise ValueError('keep must be either "first", "last"') diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py deleted file mode 100755 index d4955f1ac6c..00000000000 --- a/python/cudf/cudf/core/indexing.py +++ /dev/null @@ -1,581 +0,0 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
- -from typing import Any, Union - -import cupy as cp -import numpy as np -import pandas as pd -from nvtx import annotate - -import cudf -from cudf._lib.concat import concat_columns -from cudf._lib.scalar import _is_null_host_scalar -from cudf._typing import ColumnLike, DataFrameOrSeries, ScalarLike -from cudf.api.types import ( - _is_non_decimal_numeric_dtype, - _is_scalar_or_zero_d_array, - is_bool_dtype, - is_categorical_dtype, - is_integer, - is_integer_dtype, - is_list_like, - is_numeric_dtype, - is_scalar, -) -from cudf.core.column.column import as_column -from cudf.utils.dtypes import ( - find_common_type, - is_column_like, - to_cudf_compatible_scalar, -) - - -def indices_from_labels(obj, labels): - from cudf.core.column import column - - if not isinstance(labels, cudf.MultiIndex): - labels = column.as_column(labels) - - if is_categorical_dtype(obj.index): - labels = labels.astype("category") - codes = labels.codes.astype(obj.index._values.codes.dtype) - labels = column.build_categorical_column( - categories=labels.dtype.categories, - codes=codes, - ordered=labels.dtype.ordered, - ) - else: - labels = labels.astype(obj.index.dtype) - - # join is not guaranteed to maintain the index ordering - # so we will sort it with its initial ordering which is stored - # in column "__" - lhs = cudf.DataFrame({"__": column.arange(len(labels))}, index=labels) - rhs = cudf.DataFrame({"_": column.arange(len(obj))}, index=obj.index) - return lhs.join(rhs).sort_values("__")["_"] - - -def get_label_range_or_mask(index, start, stop, step): - if ( - not (start is None and stop is None) - and type(index) is cudf.core.index.DatetimeIndex - and index.is_monotonic is False - ): - start = pd.to_datetime(start) - stop = pd.to_datetime(stop) - if start is not None and stop is not None: - if start > stop: - return slice(0, 0, None) - # TODO: Once Index binary ops are updated to support logical_and, - # can use that instead of using cupy. - boolean_mask = cp.logical_and((index >= start), (index <= stop)) - elif start is not None: - boolean_mask = index >= start - else: - boolean_mask = index <= stop - return boolean_mask - else: - start, stop = index.find_label_range(start, stop) - return slice(start, stop, step) - - -class _SeriesIlocIndexer(object): - """ - For integer-location based selection. 
- """ - - def __init__(self, sr): - self._sr = sr - - def __getitem__(self, arg): - if isinstance(arg, tuple): - arg = list(arg) - data = self._sr._column[arg] - - if ( - isinstance(data, (dict, list)) - or _is_scalar_or_zero_d_array(data) - or _is_null_host_scalar(data) - ): - return data - return self._sr._from_data( - {self._sr.name: data}, index=cudf.Index(self._sr.index.take(arg)) - ) - - def __setitem__(self, key, value): - from cudf.core.column import column - - if isinstance(key, tuple): - key = list(key) - - # coerce value into a scalar or column - if is_scalar(value): - value = to_cudf_compatible_scalar(value) - elif not ( - isinstance(value, (list, dict)) - and isinstance( - self._sr._column.dtype, (cudf.ListDtype, cudf.StructDtype) - ) - ): - value = column.as_column(value) - - if ( - not isinstance( - self._sr._column.dtype, - (cudf.Decimal64Dtype, cudf.CategoricalDtype), - ) - and hasattr(value, "dtype") - and _is_non_decimal_numeric_dtype(value.dtype) - ): - # normalize types if necessary: - if not is_integer(key): - to_dtype = np.result_type(value.dtype, self._sr._column.dtype) - value = value.astype(to_dtype) - self._sr._column._mimic_inplace( - self._sr._column.astype(to_dtype), inplace=True - ) - - self._sr._column[key] = value - - -class _SeriesLocIndexer(object): - """ - Label-based selection - """ - - def __init__(self, sr): - self._sr = sr - - def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: - if isinstance(arg, pd.MultiIndex): - arg = cudf.from_pandas(arg) - - if isinstance(self._sr.index, cudf.MultiIndex) and not isinstance( - arg, cudf.MultiIndex - ): - result = self._sr.index._get_row_major(self._sr, arg) - if ( - isinstance(arg, tuple) - and len(arg) == self._sr._index.nlevels - and not any((isinstance(x, slice) for x in arg)) - ): - result = result.iloc[0] - return result - try: - arg = self._loc_to_iloc(arg) - except (TypeError, KeyError, IndexError, ValueError): - raise KeyError(arg) - - return self._sr.iloc[arg] - - def __setitem__(self, key, value): - try: - key = self._loc_to_iloc(key) - except KeyError as e: - if ( - is_scalar(key) - and not isinstance(self._sr.index, cudf.MultiIndex) - and is_scalar(value) - ): - _append_new_row_inplace(self._sr.index._values, key) - _append_new_row_inplace(self._sr._column, value) - return - else: - raise e - if isinstance(value, (pd.Series, cudf.Series)): - value = cudf.Series(value) - value = value._align_to_index(self._sr.index, how="right") - self._sr.iloc[key] = value - - def _loc_to_iloc(self, arg): - if _is_scalar_or_zero_d_array(arg): - if not _is_non_decimal_numeric_dtype(self._sr.index.dtype): - # TODO: switch to cudf.utils.dtypes.is_integer(arg) - if isinstance(arg, cudf.Scalar) and is_integer_dtype( - arg.dtype - ): - found_index = arg.value - return found_index - elif is_integer(arg): - found_index = arg - return found_index - try: - found_index = self._sr.index._values.find_first_value( - arg, closest=False - ) - return found_index - except (TypeError, KeyError, IndexError, ValueError): - raise KeyError("label scalar is out of bound") - - elif isinstance(arg, slice): - return get_label_range_or_mask( - self._sr.index, arg.start, arg.stop, arg.step - ) - elif isinstance(arg, (cudf.MultiIndex, pd.MultiIndex)): - if isinstance(arg, pd.MultiIndex): - arg = cudf.MultiIndex.from_pandas(arg) - - return indices_from_labels(self._sr, arg) - - else: - arg = cudf.core.series.Series(cudf.core.column.as_column(arg)) - if arg.dtype in (bool, np.bool_): - return arg - else: - indices = 
indices_from_labels(self._sr, arg) - if indices.null_count > 0: - raise KeyError("label scalar is out of bound") - return indices - - -class _DataFrameIndexer(object): - def __getitem__(self, arg): - from cudf import MultiIndex - - if isinstance(self._df.index, MultiIndex) or isinstance( - self._df.columns, MultiIndex - ): - # This try/except block allows the use of pandas-like - # tuple arguments into MultiIndex dataframes. - try: - return self._getitem_tuple_arg(arg) - except (TypeError, KeyError, IndexError, ValueError): - return self._getitem_tuple_arg((arg, slice(None))) - else: - if not isinstance(arg, tuple): - arg = (arg, slice(None)) - return self._getitem_tuple_arg(arg) - - def __setitem__(self, key, value): - if not isinstance(key, tuple): - key = (key, slice(None)) - return self._setitem_tuple_arg(key, value) - - def _can_downcast_to_series(self, df, arg): - """ - This method encapsulates the logic used - to determine whether or not the result of a loc/iloc - operation should be "downcasted" from a DataFrame to a - Series - """ - from cudf.core.column import as_column - - if isinstance(df, cudf.Series): - return False - nrows, ncols = df.shape - if nrows == 1: - if type(arg[0]) is slice: - if not is_scalar(arg[1]): - return False - elif (is_list_like(arg[0]) or is_column_like(arg[0])) and ( - is_list_like(arg[1]) - or is_column_like(arg[0]) - or type(arg[1]) is slice - ): - return False - else: - if is_bool_dtype(as_column(arg[0]).dtype) and not isinstance( - arg[1], slice - ): - return True - dtypes = df.dtypes.values.tolist() - all_numeric = all([is_numeric_dtype(t) for t in dtypes]) - if all_numeric: - return True - if ncols == 1: - if type(arg[1]) is slice: - return False - if isinstance(arg[1], tuple): - # Multiindex indexing with a slice - if any(isinstance(v, slice) for v in arg): - return False - if not (is_list_like(arg[1]) or is_column_like(arg[1])): - return True - return False - - def _downcast_to_series(self, df, arg): - """ - "Downcast" from a DataFrame to a Series - based on Pandas indexing rules - """ - nrows, ncols = df.shape - # determine the axis along which the Series is taken: - if nrows == 1 and ncols == 1: - if is_scalar(arg[0]) and is_scalar(arg[1]): - return df[df.columns[0]].iloc[0] - elif not is_scalar(arg[0]): - axis = 1 - else: - axis = 0 - - elif nrows == 1: - axis = 0 - elif ncols == 1: - axis = 1 - else: - raise ValueError("Cannot downcast DataFrame selection to Series") - - # take series along the axis: - if axis == 1: - return df[df._data.names[0]] - else: - df = _normalize_dtypes(df) - sr = df.T - return sr[sr._data.names[0]] - - -class _DataFrameLocIndexer(_DataFrameIndexer): - """ - For selection by label. 
- """ - - def __init__(self, df): - self._df = df - - def _getitem_scalar(self, arg): - return self._df[arg[1]].loc[arg[0]] - - @annotate("LOC_GETITEM", color="blue", domain="cudf_python") - def _getitem_tuple_arg(self, arg): - from uuid import uuid4 - - from cudf import MultiIndex - from cudf.core.column import column - from cudf.core.dataframe import DataFrame - from cudf.core.index import as_index - - # Step 1: Gather columns - if isinstance(arg, tuple): - columns_df = self._get_column_selection(arg[1]) - columns_df._index = self._df._index - else: - columns_df = self._df - - # Step 2: Gather rows - if isinstance(columns_df.index, MultiIndex): - if isinstance(arg, (MultiIndex, pd.MultiIndex)): - if isinstance(arg, pd.MultiIndex): - arg = MultiIndex.from_pandas(arg) - - indices = indices_from_labels(columns_df, arg) - return columns_df.take(indices) - - else: - if isinstance(arg, tuple): - return columns_df.index._get_row_major(columns_df, arg[0]) - else: - return columns_df.index._get_row_major(columns_df, arg) - else: - if isinstance(arg[0], slice): - out = get_label_range_or_mask( - columns_df.index, arg[0].start, arg[0].stop, arg[0].step - ) - if isinstance(out, slice): - df = columns_df._slice(out) - else: - df = columns_df._apply_boolean_mask(out) - else: - tmp_arg = arg - if is_scalar(arg[0]): - # If a scalar, there is possibility of having duplicates. - # Join would get all the duplicates. So, coverting it to - # an array kind. - tmp_arg = ([tmp_arg[0]], tmp_arg[1]) - if len(tmp_arg[0]) == 0: - return columns_df._empty_like(keep_index=True) - tmp_arg = (column.as_column(tmp_arg[0]), tmp_arg[1]) - - if is_bool_dtype(tmp_arg[0]): - df = columns_df._apply_boolean_mask(tmp_arg[0]) - else: - tmp_col_name = str(uuid4()) - other_df = DataFrame( - {tmp_col_name: column.arange(len(tmp_arg[0]))}, - index=as_index(tmp_arg[0]), - ) - df = other_df.join(columns_df, how="inner") - # as join is not assigning any names to index, - # update it over here - df.index.name = columns_df.index.name - df = df.sort_values(tmp_col_name) - df.drop(columns=[tmp_col_name], inplace=True) - # There were no indices found - if len(df) == 0: - raise KeyError(arg) - - # Step 3: Gather index - if df.shape[0] == 1: # we have a single row - if isinstance(arg[0], slice): - start = arg[0].start - if start is None: - start = self._df.index[0] - df.index = as_index(start) - else: - row_selection = column.as_column(arg[0]) - if is_bool_dtype(row_selection.dtype): - df.index = self._df.index.take(row_selection) - else: - df.index = as_index(row_selection) - # Step 4: Downcast - if self._can_downcast_to_series(df, arg): - return self._downcast_to_series(df, arg) - return df - - @annotate("LOC_SETITEM", color="blue", domain="cudf_python") - def _setitem_tuple_arg(self, key, value): - if isinstance(self._df.index, cudf.MultiIndex) or isinstance( - self._df.columns, pd.MultiIndex - ): - raise NotImplementedError( - "Setting values using df.loc[] not supported on " - "DataFrames with a MultiIndex" - ) - - try: - columns_df = self._get_column_selection(key[1]) - except KeyError: - if not self._df.empty and isinstance(key[0], slice): - pos_range = get_label_range_or_mask( - self._df.index, key[0].start, key[0].stop, key[0].step - ) - idx = self._df.index[pos_range] - elif self._df.empty and isinstance(key[0], slice): - idx = None - else: - idx = cudf.Index(key[0]) - if is_scalar(value): - length = len(idx) if idx is not None else 1 - value = as_column(value, length=length) - - new_col = cudf.Series(value, index=idx) - if not 
self._df.empty: - new_col = new_col._align_to_index(self._df.index, how="right") - - if self._df.empty: - self._df.index = ( - idx if idx is not None else cudf.RangeIndex(len(new_col)) - ) - self._df._data.insert(key[1], new_col) - else: - if isinstance(value, (cp.ndarray, np.ndarray)): - value_df = cudf.DataFrame(value) - if value_df.shape[1] != columns_df.shape[1]: - if value_df.shape[1] == 1: - value_cols = ( - value_df._data.columns * columns_df.shape[1] - ) - else: - raise ValueError( - f"shape mismatch: value array of shape " - f"{value_df.shape} could not be " - f"broadcast to indexing result of shape " - f"{columns_df.shape}" - ) - else: - value_cols = value_df._data.columns - for i, col in enumerate(columns_df._column_names): - self._df[col].loc[key[0]] = value_cols[i] - else: - for col in columns_df._column_names: - self._df[col].loc[key[0]] = value - - def _get_column_selection(self, arg): - return self._df._get_columns_by_label(arg) - - -class _DataFrameIlocIndexer(_DataFrameIndexer): - """ - For selection by index. - """ - - def __init__(self, df): - self._df = df - - @annotate("ILOC_GETITEM", color="blue", domain="cudf_python") - def _getitem_tuple_arg(self, arg): - from cudf import MultiIndex - from cudf.core.column import column - from cudf.core.index import as_index - - # Iloc Step 1: - # Gather the columns specified by the second tuple arg - columns_df = self._get_column_selection(arg[1]) - columns_df._index = self._df._index - - # Iloc Step 2: - # Gather the rows specified by the first tuple arg - if isinstance(columns_df.index, MultiIndex): - if isinstance(arg[0], slice): - df = columns_df[arg[0]] - else: - df = columns_df.index._get_row_major(columns_df, arg[0]) - if (len(df) == 1 and len(columns_df) >= 1) and not ( - isinstance(arg[0], slice) or isinstance(arg[1], slice) - ): - # Pandas returns a numpy scalar in this case - return df.iloc[0] - if self._can_downcast_to_series(df, arg): - return self._downcast_to_series(df, arg) - return df - else: - if isinstance(arg[0], slice): - df = columns_df._slice(arg[0]) - elif is_scalar(arg[0]): - index = arg[0] - if index < 0: - index += len(columns_df) - df = columns_df._slice(slice(index, index + 1, 1)) - else: - arg = (column.as_column(arg[0]), arg[1]) - if is_bool_dtype(arg[0]): - df = columns_df._apply_boolean_mask(arg[0]) - else: - df = columns_df._gather(arg[0]) - - # Iloc Step 3: - # Reindex - if df.shape[0] == 1: # we have a single row without an index - df.index = as_index(self._df.index[arg[0]]) - - # Iloc Step 4: - # Downcast - if self._can_downcast_to_series(df, arg): - return self._downcast_to_series(df, arg) - - if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice): - df._index = as_index(self._df.index[arg[0]]) - return df - - @annotate("ILOC_SETITEM", color="blue", domain="cudf_python") - def _setitem_tuple_arg(self, key, value): - columns = self._get_column_selection(key[1]) - - for col in columns: - self._df[col].iloc[key[0]] = value - - def _getitem_scalar(self, arg): - col = self._df.columns[arg[1]] - return self._df[col].iloc[arg[0]] - - def _get_column_selection(self, arg): - return cudf.DataFrame(self._df._get_columns_by_index(arg)) - - -def _normalize_dtypes(df): - if df._num_columns > 0: - dtypes = df.dtypes.values.tolist() - normalized_dtype = np.result_type(*dtypes) - for name, col in df._data.items(): - df[name] = col.astype(normalized_dtype) - return df - - -def _append_new_row_inplace(col: ColumnLike, value: ScalarLike): - """Append a scalar `value` to the end of `col` inplace. 
- Cast to common type if possible - """ - to_type = find_common_type([type(value), col.dtype]) - val_col = as_column(value, dtype=to_type) - old_col = col.astype(to_type) - - col._mimic_inplace(concat_columns([old_col, val_col]), inplace=True) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e1274dc7758..7c132e3fb71 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -656,8 +656,7 @@ def _compute_levels_and_codes(self): self._codes = cudf.DataFrame._from_data(codes) def _compute_validity_mask(self, index, row_tuple, max_length): - """ Computes the valid set of indices of values in the lookup - """ + """Computes the valid set of indices of values in the lookup""" lookup = cudf.DataFrame() for name, row in zip(index.names, row_tuple): if isinstance(row, slice) and row == slice(None): @@ -836,22 +835,11 @@ def size(self): return self._num_rows def take(self, indices): - if isinstance(indices, (Integral, Sequence)): - indices = np.array(indices) - elif isinstance(indices, cudf.Series) and indices.has_nulls: + if isinstance(indices, cudf.Series) and indices.has_nulls: raise ValueError("Column must have no nulls.") - elif isinstance(indices, slice): - start, stop, step = indices.indices(len(self)) - indices = column.arange(start, stop, step) - result = MultiIndex.from_frame( - self.to_frame(index=False).take(indices) - ) - if self._codes is not None: - result._codes = self._codes.take(indices) - if self._levels is not None: - result._levels = self._levels - result.names = self.names - return result + obj = super().take(indices) + obj.names = self.names + return obj def serialize(self): header, frames = super().serialize() @@ -888,11 +876,26 @@ def deserialize(cls, header, frames): return obj._set_names(column_names) def __getitem__(self, index): - if isinstance(index, int): - # we are indexing into a single row of the MultiIndex, - # return that row as a tuple: - return self.take(index).to_pandas()[0] - return self.take(index) + flatten = isinstance(index, int) + + if isinstance(index, (Integral, Sequence)): + index = np.array(index) + elif isinstance(index, slice): + start, stop, step = index.indices(len(self)) + index = column.arange(start, stop, step) + result = MultiIndex.from_frame(self.to_frame(index=False).take(index)) + + # we are indexing into a single row of the MultiIndex, + # return that row as a tuple: + if flatten: + return result.to_pandas()[0] + + if self._codes is not None: + result._codes = self._codes.take(index) + if self._levels is not None: + result._levels = self._levels + result.names = self.names + return result def to_frame(self, index=True, name=None): # TODO: Currently this function makes a shallow copy, which is @@ -941,6 +944,27 @@ def get_level_values(self, level): level_values = as_index(self._data[level], name=self.names[level_idx]) return level_values + def is_numeric(self): + return False + + def is_boolean(self): + return False + + def is_integer(self): + return False + + def is_floating(self): + return False + + def is_object(self): + return False + + def is_categorical(self): + return False + + def is_interval(self): + return False + @classmethod def _concat(cls, objs): @@ -1344,23 +1368,6 @@ def is_monotonic_decreasing(self): ascending=[False] * len(self.levels), null_position=None ) - def argsort(self, ascending=True, **kwargs): - return self._get_sorted_inds(ascending=ascending, **kwargs).values - - def sort_values(self, return_indexer=False, ascending=True, key=None): - 
if key is not None: - raise NotImplementedError("key parameter is not yet implemented.") - - indices = cudf.Series._from_data( - {None: self._get_sorted_inds(ascending=ascending)} - ) - index_sorted = as_index(self.take(indices), name=self.names) - - if return_indexer: - return index_sorted, cupy.asarray(indices) - else: - return index_sorted - def fillna(self, value): """ Fill null values with the specified value. @@ -1649,3 +1656,74 @@ def get_loc(self, key, method=None, tolerance=None): mask = cupy.full(self._data.nrows, False) mask[true_inds] = True return mask + + def _get_reconciled_name_object(self, other) -> MultiIndex: + """ + If the result of a set operation will be self, + return self, unless the names change, in which + case make a shallow copy of self. + """ + names = self._maybe_match_names(other) + if self.names != names: + return self.rename(names) + return self + + def _maybe_match_names(self, other): + """ + Try to find common names to attach to the result of an operation + between a and b. Return a consensus list of names if they match + at least partly or list of None if they have completely + different names. + """ + if len(self.names) != len(other.names): + return [None] * len(self.names) + return [ + self_name if self_name == other_name else None + for self_name, other_name in zip(self.names, other.names) + ] + + def _union(self, other, sort=None): + # TODO: When to_frame is refactored to return a + # deep copy in future, we should push most of the common + # logic between MultiIndex._union & BaseIndex._union into + # GenericIndex._union. + other_df = other.copy(deep=True).to_frame(index=False) + self_df = self.copy(deep=True).to_frame(index=False) + col_names = list(range(0, self.nlevels)) + self_df.columns = col_names + other_df.columns = col_names + self_df["order"] = self_df.index + other_df["order"] = other_df.index + + result_df = self_df.merge(other_df, on=col_names, how="outer") + result_df = result_df.sort_values( + by=result_df.columns[self.nlevels :], ignore_index=True + ) + + midx = MultiIndex.from_frame(result_df.iloc[:, : self.nlevels]) + midx.names = self.names if self.names == other.names else None + if sort is None and len(other): + return midx.sort_values() + return midx + + def _intersection(self, other, sort=None): + if self.names != other.names: + deep = True + col_names = list(range(0, self.nlevels)) + res_name = (None,) * self.nlevels + else: + deep = False + col_names = None + res_name = self.names + + other_df = other.copy(deep=deep).to_frame(index=False) + self_df = self.copy(deep=deep).to_frame(index=False) + if col_names is not None: + other_df.columns = col_names + self_df.columns = col_names + + result_df = cudf.merge(self_df, other_df, how="inner") + midx = self.__class__.from_frame(result_df, names=res_name) + if sort is None and len(other): + return midx.sort_values() + return midx diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index a2155deb51e..b2fac7a6140 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,16 +1,21 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. 
import itertools +from typing import Dict, Optional import numpy as np import pandas as pd import cudf +from cudf._lib.transform import one_hot_encode +from cudf._typing import Dtype +from cudf.core.column import ColumnBase, as_column, column_empty_like +from cudf.core.column.categorical import CategoricalColumn _AXIS_MAP = {0: 0, 1: 1, "index": 0, "columns": 1} -def _align_objs(objs, how="outer"): +def _align_objs(objs, how="outer", sort=None): """Align a set of Series or Dataframe objects. Parameters @@ -18,16 +23,18 @@ def _align_objs(objs, how="outer"): objs : list of DataFrame, Series, or Index how : How to handle indexes on other axis (or axes), similar to join in concat + sort : Whether to sort the resulting Index Returns ------- - A bool for if indexes have matched and a set of - reindexed and aligned objects ready for concatenation + A list of reindexed and aligned objects + ready for concatenation """ # Check if multiindex then check if indexes match. GenericIndex # returns ndarray tuple of bools requiring additional filter. # Then check for duplicate index value. i_objs = iter(objs) first = next(i_objs) + not_matching_index = any( not first.index.equals(rest.index) for rest in i_objs ) @@ -38,36 +45,50 @@ def _align_objs(objs, how="outer"): index = objs[0].index name = index.name - if how == "inner" or isinstance(index, cudf.MultiIndex): - for obj in objs[1:]: - index = ( - cudf.DataFrame(index=obj.index) - .join(cudf.DataFrame(index=index), how=how) - .index - ) - index.name = name - return [obj.reindex(index) for obj in objs], False - else: - all_index_objs = [obj.index for obj in objs] - appended_index = all_index_objs[0].append(all_index_objs[1:]) - df = cudf.DataFrame( - { - "idxs": appended_index, - "order": cudf.core.column.arange( - start=0, stop=len(appended_index) - ), - } - ) - df = df.drop_duplicates(subset=["idxs"]).sort_values( - by=["order"], ascending=True - ) - final_index = df["idxs"] - final_index.name = name + final_index = _get_combined_index( + [obj.index for obj in objs], intersect=how == "inner", sort=sort + ) - return [obj.reindex(final_index) for obj in objs], False + final_index.name = name + return [ + obj.reindex(final_index) + if not final_index.equals(obj.index) + else obj + for obj in objs + ] + else: + if sort: + if not first.index.is_monotonic_increasing: + final_index = first.index.sort_values() + return [obj.reindex(final_index) for obj in objs] + return objs + + +def _get_combined_index(indexes, intersect: bool = False, sort=None): + if len(indexes) == 0: + index = cudf.Index([]) + elif len(indexes) == 1: + index = indexes[0] + elif intersect: + sort = True + index = indexes[0] + for other in indexes[1:]: + # Don't sort for every intersection, + # let the sorting happen in the end. + index = index.intersection(other, sort=False) else: - return objs, True + index = indexes[0] + if sort is None: + sort = False if isinstance(index, cudf.StringIndex) else True + for other in indexes[1:]: + index = index.union(other, sort=False) + + if sort: + if not index.is_monotonic_increasing: + index = index.sort_values() + + return index def _normalize_series_and_dataframe(objs, axis): @@ -202,7 +223,6 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): 0 a 1 bird polly 1 b 2 monkey george """ - # TODO: Do we really need to have different error messages for an empty # list and a list of None? 
if not objs: @@ -286,9 +306,12 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): if len(objs) == 0: return df - objs, match_index = _align_objs(objs, how=join) + # Don't need to align indices of all `objs` since we + # would anyway return an empty dataframe below + if not empty_inner: + objs = _align_objs(objs, how=join, sort=sort) + df.index = objs[0].index - df.index = objs[0].index for o in objs: for name, col in o._data.items(): if name in df._data: @@ -297,7 +320,15 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): f"doesn't support having multiple columns with " f"same names yet." ) - df[name] = col + if empty_inner: + # if join is inner and it contains an empty df + # we return an empty df, hence creating an empty + # column with dtype metadata retained. + df[name] = cudf.core.column.column_empty_like( + col, newsize=0 + ) + else: + df[name] = col result_columns = objs[0].columns.append( [obj.columns for obj in objs[1:]] @@ -314,20 +345,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): # we return an empty df return df.head(0) - # This check uses `sort is not False` rather than just `sort=True` - # to differentiate between a user-provided `False` value and the - # default `None`. This is necessary for pandas compatibility, even - # though `True` and `False` are the only valid options from the user. - if not match_index and sort is not False: - return df.sort_index() - - if sort or join == "inner": - # when join='outer' and sort=False string indexes - # are returned unsorted. Everything else seems - # to be returned sorted when axis = 1 - return df.sort_index() - else: - return df + return df # If we get here, we are always concatenating along axis 0 (the rows). typ = list(typs)[0] @@ -499,7 +517,7 @@ def melt( dtypes = [frame[col].dtype for col in id_vars + value_vars] if any(cudf.api.types.is_categorical_dtype(t) for t in dtypes): raise NotImplementedError( - "Categorical columns are not yet " "supported for function" + "Categorical columns are not yet supported for function" ) # Check dtype homogeneity in value_var @@ -574,7 +592,7 @@ def get_dummies( drop_first=False, dtype="uint8", ): - """ Returns a dataframe whose columns are the one hot encodings of all + """Returns a dataframe whose columns are the one hot encodings of all columns in `df` Parameters @@ -654,6 +672,7 @@ def get_dummies( 3 0 0 1 0 4 0 0 0 1 """ + if cats is None: cats = {} if sparse: @@ -693,43 +712,40 @@ def get_dummies( if len(columns) == 0: return df.select_dtypes(exclude=encode_fallback_dtypes) else: - result_df = df.copy(deep=False) - result_df.drop(columns=columns, inplace=True) + result_data = { + col_name: col + for col_name, col in df._data.items() + if col_name not in columns + } for name in columns: - unique = _get_unique(column=df._data[name], dummy_na=dummy_na) + if name not in cats: + unique = _get_unique( + column=df._data[name], dummy_na=dummy_na + ) + else: + unique = as_column(cats[name]) - col_enc_df = df.one_hot_encoding( - name, + col_enc_data = _one_hot_encode_column( + column=df._data[name], + categories=unique, prefix=prefix_map.get(name, prefix), - cats=cats.get(name, unique), prefix_sep=prefix_sep_map.get(name, prefix_sep), dtype=dtype, ) - for col in col_enc_df.columns.difference(df._data.names): - result_df[col] = col_enc_df._data[col] - - return result_df + result_data.update(col_enc_data) + return cudf.DataFrame._from_data(result_data, index=df._index) else: ser = cudf.Series(df) unique = 
_get_unique(column=ser._column, dummy_na=dummy_na) - - if hasattr(unique, "to_arrow"): - cats = unique.to_arrow().to_pylist() - else: - cats = pd.Series(unique, dtype="object") - - col_names = ["null" if cat is None else cat for cat in cats] - - if prefix is not None: - col_names = [f"{prefix}{prefix_sep}{cat}" for cat in col_names] - - newcols = ser.one_hot_encoding(cats=cats, dtype=dtype) - result_df = cudf.DataFrame(index=ser.index) - for i, col in enumerate(newcols): - result_df._data[col_names[i]] = col - - return result_df + data = _one_hot_encode_column( + column=ser._column, + categories=unique, + prefix=prefix, + prefix_sep=prefix_sep, + dtype=dtype, + ) + return cudf.DataFrame._from_data(data, index=ser._index) def merge_sorted( @@ -1066,6 +1082,38 @@ def _get_unique(column, dummy_na): return unique +def _one_hot_encode_column( + column: ColumnBase, + categories: ColumnBase, + prefix: Optional[str], + prefix_sep: Optional[str], + dtype: Optional[Dtype], +) -> Dict[str, ColumnBase]: + """Encode a single column with one hot encoding. The return dictionary + contains pairs of (category, encodings). The keys may be prefixed with + `prefix`, separated with category name with `prefix_sep`. The encoding + columns maybe coerced into `dtype`. + """ + if isinstance(column, CategoricalColumn): + if column.size == column.null_count: + column = column_empty_like(categories, newsize=column.size) + else: + column = column._get_decategorized_column() + + if column.size * categories.size >= np.iinfo("int32").max: + raise ValueError( + "Size limitation exceeded: column.size * category.size < " + "np.iinfo('int32').max. Consider reducing size of category" + ) + data = one_hot_encode(column, categories) + + if prefix is not None and prefix_sep is not None: + data = {f"{prefix}{prefix_sep}{col}": enc for col, enc in data.items()} + if dtype: + data = {k: v.astype(dtype) for k, v in data.items()} + return data + + def _length_check_params(obj, columns, name): if cudf.api.types.is_list_like(obj): if len(obj) != len(columns): diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index f425b650ee7..787b28e213c 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -7,7 +7,7 @@ import cudf from cudf.core.column.column import ColumnBase -from cudf.core.dtypes import Decimal64Dtype, ListDtype, StructDtype +from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.index import BaseIndex from cudf.core.series import Series from cudf.utils.dtypes import ( @@ -145,12 +145,12 @@ def _preprocess_host_value(self, value, dtype): else: return NA, dtype - if isinstance(dtype, Decimal64Dtype): + if isinstance(dtype, (cudf.Decimal64Dtype, cudf.Decimal32Dtype)): value = pa.scalar( value, type=pa.decimal128(dtype.precision, dtype.scale) ).as_py() if isinstance(value, decimal.Decimal) and dtype is None: - dtype = Decimal64Dtype._from_decimal(value) + dtype = cudf.Decimal64Dtype._from_decimal(value) value = to_cudf_compatible_scalar(value, dtype=dtype) @@ -171,7 +171,7 @@ def _preprocess_host_value(self, value, dtype): else: dtype = value.dtype - if not isinstance(dtype, Decimal64Dtype): + if not isinstance(dtype, (cudf.Decimal64Dtype, cudf.Decimal32Dtype)): dtype = cudf.dtype(dtype) if not valid: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 8188290c392..2dd70b336e9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2,28 +2,38 @@ from __future__ import annotations +import functools 
+import inspect import pickle import warnings from collections import abc as abc +from hashlib import sha256 from numbers import Number from shutil import get_terminal_size -from typing import Any, MutableMapping, Optional, Set +from typing import Any, MutableMapping, Optional, Set, Union from uuid import uuid4 import cupy import numpy as np import pandas as pd +from numba import cuda from pandas._config import get_option import cudf from cudf import _lib as libcudf +from cudf._lib.scalar import _is_null_host_scalar from cudf._lib.transform import bools_to_mask +from cudf._typing import ColumnLike, DataFrameOrSeries, ScalarLike from cudf.api.types import ( + _is_non_decimal_numeric_dtype, + _is_scalar_or_zero_d_array, is_bool_dtype, is_categorical_dtype, is_decimal_dtype, is_dict_like, is_dtype_equal, + is_integer, + is_integer_dtype, is_interval_dtype, is_list_dtype, is_list_like, @@ -48,10 +58,16 @@ from cudf.core.column.string import StringMethods from cudf.core.column.struct import StructMethods from cudf.core.column_accessor import ColumnAccessor -from cudf.core.frame import Frame, SingleColumnFrame, _drop_rows_by_labels +from cudf.core.frame import Frame, _drop_rows_by_labels from cudf.core.groupby.groupby import SeriesGroupBy from cudf.core.index import BaseIndex, RangeIndex, as_index -from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer +from cudf.core.indexed_frame import ( + IndexedFrame, + _FrameIndexer, + _get_label_range_or_mask, + _indices_from_labels, +) +from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils import cudautils, docutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -63,10 +79,171 @@ from cudf.utils.utils import ( get_appropriate_dispatched_func, get_relevant_submodule, + to_cudf_compatible_scalar, ) -class Series(SingleColumnFrame, Serializable): +def _append_new_row_inplace(col: ColumnLike, value: ScalarLike): + """Append a scalar `value` to the end of `col` inplace. + Cast to common type if possible + """ + to_type = find_common_type([type(value), col.dtype]) + val_col = as_column(value, dtype=to_type) + old_col = col.astype(to_type) + + col._mimic_inplace(concat_columns([old_col, val_col]), inplace=True) + + +class _SeriesIlocIndexer(_FrameIndexer): + """ + For integer-location based selection. 
+ """ + + def __getitem__(self, arg): + if isinstance(arg, tuple): + arg = list(arg) + data = self._frame._column[arg] + + if ( + isinstance(data, (dict, list)) + or _is_scalar_or_zero_d_array(data) + or _is_null_host_scalar(data) + ): + return data + return self._frame._from_data( + {self._frame.name: data}, index=cudf.Index(self._frame.index[arg]), + ) + + def __setitem__(self, key, value): + from cudf.core.column import column + + if isinstance(key, tuple): + key = list(key) + + # coerce value into a scalar or column + if is_scalar(value): + value = to_cudf_compatible_scalar(value) + elif not ( + isinstance(value, (list, dict)) + and isinstance( + self._frame._column.dtype, (cudf.ListDtype, cudf.StructDtype) + ) + ): + value = column.as_column(value) + + if ( + not isinstance( + self._frame._column.dtype, + ( + cudf.Decimal64Dtype, + cudf.Decimal32Dtype, + cudf.CategoricalDtype, + ), + ) + and hasattr(value, "dtype") + and _is_non_decimal_numeric_dtype(value.dtype) + ): + # normalize types if necessary: + if not is_integer(key): + to_dtype = np.result_type( + value.dtype, self._frame._column.dtype + ) + value = value.astype(to_dtype) + self._frame._column._mimic_inplace( + self._frame._column.astype(to_dtype), inplace=True + ) + + self._frame._column[key] = value + + +class _SeriesLocIndexer(_FrameIndexer): + """ + Label-based selection + """ + + def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: + if isinstance(arg, pd.MultiIndex): + arg = cudf.from_pandas(arg) + + if isinstance(self._frame.index, cudf.MultiIndex) and not isinstance( + arg, cudf.MultiIndex + ): + result = self._frame.index._get_row_major(self._frame, arg) + if ( + isinstance(arg, tuple) + and len(arg) == self._frame._index.nlevels + and not any((isinstance(x, slice) for x in arg)) + ): + result = result.iloc[0] + return result + try: + arg = self._loc_to_iloc(arg) + except (TypeError, KeyError, IndexError, ValueError): + raise KeyError(arg) + + return self._frame.iloc[arg] + + def __setitem__(self, key, value): + try: + key = self._loc_to_iloc(key) + except KeyError as e: + if ( + is_scalar(key) + and not isinstance(self._frame.index, cudf.MultiIndex) + and is_scalar(value) + ): + _append_new_row_inplace(self._frame.index._values, key) + _append_new_row_inplace(self._frame._column, value) + return + else: + raise e + if isinstance(value, (pd.Series, cudf.Series)): + value = cudf.Series(value) + value = value._align_to_index(self._frame.index, how="right") + self._frame.iloc[key] = value + + def _loc_to_iloc(self, arg): + if _is_scalar_or_zero_d_array(arg): + if not _is_non_decimal_numeric_dtype(self._frame.index.dtype): + # TODO: switch to cudf.utils.dtypes.is_integer(arg) + if isinstance(arg, cudf.Scalar) and is_integer_dtype( + arg.dtype + ): + found_index = arg.value + return found_index + elif is_integer(arg): + found_index = arg + return found_index + try: + found_index = self._frame.index._values.find_first_value( + arg, closest=False + ) + return found_index + except (TypeError, KeyError, IndexError, ValueError): + raise KeyError("label scalar is out of bound") + + elif isinstance(arg, slice): + return _get_label_range_or_mask( + self._frame.index, arg.start, arg.stop, arg.step + ) + elif isinstance(arg, (cudf.MultiIndex, pd.MultiIndex)): + if isinstance(arg, pd.MultiIndex): + arg = cudf.MultiIndex.from_pandas(arg) + + return _indices_from_labels(self._frame, arg) + + else: + arg = cudf.core.series.Series(cudf.core.column.as_column(arg)) + if arg.dtype in (bool, np.bool_): + return arg + 
else: + indices = _indices_from_labels(self._frame, arg) + if indices.null_count > 0: + raise KeyError("label scalar is out of bound") + return indices + + +class Series(SingleColumnFrame, IndexedFrame, Serializable): """ One-dimensional GPU array (including time series). @@ -110,6 +287,8 @@ class Series(SingleColumnFrame, Serializable): """ _accessors: Set[Any] = set() + _loc_indexer_type = _SeriesLocIndexer + _iloc_indexer_type = _SeriesIlocIndexer # The `constructor*` properties are used by `dask` (and `dask_cudf`) @property @@ -552,7 +731,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): See Also -------- - cudf.core.reshape.concat : General function to concatenate DataFrame or + cudf.concat : General function to concatenate DataFrame or Series objects. Examples @@ -742,6 +921,10 @@ def set_index(self, index): e 14 dtype: int64 """ + warnings.warn( + "Series.set_index is deprecated and will be removed in the future", + FutureWarning, + ) index = index if isinstance(index, BaseIndex) else as_index(index) return self._from_data(self._data, index, self.name) @@ -980,64 +1163,16 @@ def __getitem__(self, arg): items = SingleColumnFrame.__iter__ - def to_dict(self, into=dict): - raise TypeError( - "cuDF does not support conversion to host memory " - "via `to_dict()` method. Consider using " - "`.to_pandas().to_dict()` to construct a Python dictionary." - ) - def __setitem__(self, key, value): if isinstance(key, slice): self.iloc[key] = value else: self.loc[key] = value - def take(self, indices, keep_index=True): - """ - Return Series by taking values from the corresponding *indices*. - - Parameters - ---------- - indices : array-like or scalar - An array/scalar like integers indicating which positions to take. - keep_index : bool, default True - Whethere to retain the index in result Series or not. - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 13, 14]) - >>> series - 0 10 - 1 11 - 2 12 - 3 13 - 4 14 - dtype: int64 - >>> series.take([0, 4]) - 0 10 - 4 14 - dtype: int64 - - If you want to drop the index, pass `keep_index=False` - - >>> series.take([0, 4], keep_index=False) - 0 10 - 1 14 - dtype: int64 - """ - if keep_index is True or is_scalar(indices): - return self.iloc[indices] - else: - col_inds = as_column(indices) - return self._from_data( - {self.name: self._column.take(col_inds, keep_index=False)} - ) + def take(self, indices, axis=0, keep_index=True): + # Validate but don't use the axis. + _ = self._get_axis_from_axis_arg(axis) + return super().take(indices, keep_index) def __repr__(self): _, height = get_terminal_size() @@ -1153,1018 +1288,57 @@ def __repr__(self): def _binaryop( self, - other: Frame, - fn: str, - fill_value: Any = None, - reflect: bool = False, - can_reindex: bool = False, - *args, - **kwargs, - ): - # Specialize binops to align indices. - if isinstance(other, SingleColumnFrame): - if ( - # TODO: The can_reindex logic also needs to be applied for - # DataFrame (the methods that need it just don't exist yet). - not can_reindex - and fn in cudf.utils.utils._EQUALITY_OPS - and ( - isinstance(other, Series) - # TODO: mypy doesn't like this line because the index - # property is not defined on SingleColumnFrame (or Index, - # for that matter). Ignoring is the easy solution for now, - # a cleaner fix requires reworking the type hierarchy. 
- and not self.index.equals(other.index) # type: ignore - ) - ): - raise ValueError( - "Can only compare identically-labeled " "Series objects" - ) - lhs, other = _align_indices([self, other], allow_non_unique=True) - else: - lhs = self - - operands = lhs._make_operands_for_binop(other, fill_value, reflect) - return ( - lhs._from_data( - data=lhs._colwise_binop(operands, fn), index=lhs._index, - ) - if operands is not NotImplemented - else NotImplemented - ) - - def add(self, other, fill_value=None, axis=0): - """ - Addition of series and other, element-wise - (binary operator add). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the addition. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 1, 1, None], index=['a', 'b', 'c', 'd']) - >>> a - a 1 - b 1 - c 1 - d - dtype: int64 - >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e']) - >>> b - a 1 - b - d 1 - e - dtype: int64 - >>> a.add(b) - a 2 - b - c - d - e - dtype: int64 - >>> a.add(b, fill_value=0) - a 2 - b 1 - c 1 - d 1 - e - dtype: int64 - """ - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "add", fill_value) - - def radd(self, other, fill_value=None, axis=0): - """Addition of series and other, element-wise - (binary operator radd). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) - >>> a - a 1 - b 2 - c 3 - d - dtype: int64 - >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e']) - >>> b - a 1 - b - d 1 - e - dtype: int64 - >>> a.add(b, fill_value=0) - a 2 - b 2 - c 3 - d 1 - e - dtype: int64 - """ - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other, "add", fill_value=fill_value, reflect=True - ) - - def subtract(self, other, fill_value=None, axis=0): - """Subtraction of series and other, element-wise - (binary operator sub). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([10, 20, None, 30, None], index=['a', 'b', 'c', 'd', 'e']) - >>> a - a 10 - b 20 - c - d 30 - e - dtype: int64 - >>> b = cudf.Series([1, None, 2, 30], index=['a', 'c', 'b', 'd']) - >>> b - a 1 - c - b 2 - d 30 - dtype: int64 - >>> a.subtract(b, fill_value=2) - a 9 - b 18 - c - d 0 - e - dtype: int64 - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "sub", fill_value) - - sub = subtract - - def rsub(self, other, fill_value=None, axis=0): - """Subtraction of series and other, element-wise - (binary operator rsub). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. 
If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) - >>> a - a 1 - b 2 - c 3 - d - dtype: int64 - >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e']) - >>> b - a 1 - b - d 2 - e - dtype: int64 - >>> a.rsub(b, fill_value=10) - a 0 - b 8 - c 7 - d -8 - e - dtype: int64 - """ - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "sub", fill_value, reflect=True) - - def multiply(self, other, fill_value=None, axis=0): - """Multiplication of series and other, element-wise - (binary operator mul). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) - >>> a - a 1 - b 2 - c 3 - d - dtype: int64 - >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e']) - >>> b - a 1 - b - d 2 - e - dtype: int64 - >>> a.multiply(b, fill_value=0) - a 1 - b 0 - c 0 - d 0 - e - dtype: int64 - """ - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "mul", fill_value=fill_value) - - mul = multiply - - def rmul(self, other, fill_value=None, axis=0): - """Multiplication of series and other, element-wise - (binary operator rmul). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([10, 20, None, 30, 40], index=['a', 'b', 'c', 'd', 'e']) - >>> a - a 10 - b 20 - c - d 30 - e 40 - dtype: int64 - >>> b = cudf.Series([None, 1, 20, 5, 4], index=['a', 'b', 'd', 'e', 'f']) - >>> b - a - b 1 - d 20 - e 5 - f 4 - dtype: int64 - >>> a.rmul(b, fill_value=2) - a 20 - b 20 - c - d 600 - e 200 - f 8 - dtype: int64 - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "mul", fill_value, True) - - def mod(self, other, fill_value=None, axis=0): - """Modulo of series and other, element-wise - (binary operator mod). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 20, 30]) - >>> series - 0 10 - 1 20 - 2 30 - dtype: int64 - >>> series.mod(4) - 0 2 - 1 0 - 2 2 - dtype: int64 - """ - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "mod", fill_value) - - def rmod(self, other, fill_value=None, axis=0): - """Modulo of series and other, element-wise - (binary operator rmod). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. 
If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([10, 20, None, 30, 40], index=['a', 'b', 'c', 'd', 'e']) - >>> a - a 10 - b 20 - c - d 30 - e 40 - dtype: int64 - >>> b = cudf.Series([None, 1, 20, 5, 4], index=['a', 'b', 'd', 'e', 'f']) - >>> b - a - b 1 - d 20 - e 5 - f 4 - dtype: int64 - >>> a.rmod(b, fill_value=10) - a 0 - b 1 - c - d 20 - e 5 - f 4 - dtype: int64 - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "mod", fill_value, True) - - def pow(self, other, fill_value=None, axis=0): - """Exponential power of series and other, element-wise - (binary operator pow). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) - >>> a - a 1 - b 2 - c 3 - d - dtype: int64 - >>> b = cudf.Series([10, None, 12, None], index=['a', 'b', 'd', 'e']) - >>> b - a 10 - b - d 12 - e - dtype: int64 - >>> a.pow(b, fill_value=0) - a 1 - b 1 - c 1 - d 0 - e - dtype: int64 - """ - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "pow", fill_value) - - def rpow(self, other, fill_value=None, axis=0): - """Exponential power of series and other, element-wise - (binary operator rpow). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) - >>> a - a 1 - b 2 - c 3 - d - dtype: int64 - >>> b = cudf.Series([10, None, 12, None], index=['a', 'b', 'd', 'e']) - >>> b - a 10 - b - d 12 - e - dtype: int64 - >>> a.rpow(b, fill_value=0) - a 10 - b 0 - c 0 - d 1 - e - dtype: int64 - """ - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "pow", fill_value, True) - - def floordiv(self, other, fill_value=None, axis=0): - """Integer division of series and other, element-wise - (binary operator floordiv). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 1, 1, None], index=['a', 'b', 'c', 'd']) - >>> a - a 1 - b 1 - c 1 - d - dtype: int64 - >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e']) - >>> b - a 1 - b - d 1 - e - dtype: int64 - >>> a.floordiv(b) - a 1 - b - c - d - e - dtype: int64 - """ - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "floordiv", fill_value) - - def rfloordiv(self, other, fill_value=None, axis=0): - """Integer division of series and other, element-wise - (binary operator rfloordiv). 
- - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - Result of the arithmetic operation. - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([1, 2, 10, 17]) - >>> s - 0 1 - 1 2 - 2 10 - 3 17 - dtype: int64 - >>> s.rfloordiv(100) - 0 100 - 1 50 - 2 10 - 3 5 - dtype: int64 - >>> s = cudf.Series([10, 20, None]) - >>> s - 0 10 - 1 20 - 2 - dtype: int64 - >>> s.rfloordiv(200) - 0 20 - 1 10 - 2 - dtype: int64 - >>> s.rfloordiv(200, fill_value=2) - 0 20 - 1 10 - 2 100 - dtype: int64 - """ - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "floordiv", fill_value, True) - - def truediv(self, other, fill_value=None, axis=0): - """Floating division of series and other, element-wise - (binary operator truediv). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The reuslt of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 10, 20, None], index=['a', 'b', 'c', 'd']) - >>> a - a 1 - b 10 - c 20 - d - dtype: int64 - >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e']) - >>> b - a 1 - b - d 2 - e - dtype: int64 - >>> a.truediv(b, fill_value=0) - a 1.0 - b Inf - c Inf - d 0.0 - e - dtype: float64 - """ - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "truediv", fill_value) - - def rtruediv(self, other, fill_value=None, axis=0): - """Floating division of series and other, element-wise - (binary operator rtruediv). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([10, 20, None, 30], index=['a', 'b', 'c', 'd']) - >>> a - a 10 - b 20 - c - d 30 - dtype: int64 - >>> b = cudf.Series([1, None, 2, 3], index=['a', 'b', 'd', 'e']) - >>> b - a 1 - b - d 2 - e 3 - dtype: int64 - >>> a.rtruediv(b, fill_value=0) - a 0.1 - b 0.0 - c - d 0.066666667 - e Inf - dtype: float64 - """ - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop(other, "truediv", fill_value, True) - - def logical_and(self, other): - return self._binaryop(other, "l_and").astype(np.bool_) - - def remainder(self, other): - return self._binaryop(other, "mod") - - def logical_or(self, other): - return self._binaryop(other, "l_or").astype(np.bool_) - - def logical_not(self): - return self._unaryop("not") - - def eq(self, other, fill_value=None, axis=0): - """Equal to of series and other, element-wise - (binary operator eq). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. 
- - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g']) - >>> a - a 1 - c 2 - d 3 - e - f 10 - g 20 - dtype: int64 - >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e']) - >>> b - a -10 - b 23 - c -1 - d - e - dtype: int64 - >>> a.eq(b, fill_value=2) - a False - b False - c False - d False - e - f False - g False - dtype: bool - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other=other, fn="eq", fill_value=fill_value, can_reindex=True - ) - - def ne(self, other, fill_value=None, axis=0): - """Not equal to of series and other, element-wise - (binary operator ne). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g']) - >>> a - a 1 - c 2 - d 3 - e - f 10 - g 20 - dtype: int64 - >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e']) - >>> b - a -10 - b 23 - c -1 - d - e - dtype: int64 - >>> a.ne(b, fill_value=2) - a True - b True - c True - d True - e - f True - g True - dtype: bool - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other=other, fn="ne", fill_value=fill_value, can_reindex=True - ) - - def lt(self, other, fill_value=None, axis=0): - """Less than of series and other, element-wise - (binary operator lt). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. - - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g']) - >>> a - a 1 - c 2 - d 3 - e - f 10 - g 20 - dtype: int64 - >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e']) - >>> b - a -10 - b 23 - c -1 - d - e - dtype: int64 - >>> a.lt(b, fill_value=-10) - a False - b True - c False - d False - e - f False - g False - dtype: bool - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other=other, fn="lt", fill_value=fill_value, can_reindex=True - ) - - def le(self, other, fill_value=None, axis=0): - """Less than or equal to of series and other, element-wise - (binary operator le). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. 
- - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g']) - >>> a - a 1 - c 2 - d 3 - e - f 10 - g 20 - dtype: int64 - >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e']) - >>> b - a -10 - b 23 - c -1 - d - e - dtype: int64 - >>> a.le(b, fill_value=-10) - a False - b True - c False - d False - e - f False - g False - dtype: bool - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other=other, fn="le", fill_value=fill_value, can_reindex=True - ) - - def gt(self, other, fill_value=None, axis=0): - """Greater than of series and other, element-wise - (binary operator gt). - - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null - - Returns - ------- - Series - The result of the operation. + other: Frame, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + # Specialize binops to align indices. + if isinstance(other, SingleColumnFrame): + if ( + # TODO: The can_reindex logic also needs to be applied for + # DataFrame (the methods that need it just don't exist yet). + not can_reindex + and fn in cudf.utils.utils._EQUALITY_OPS + and ( + isinstance(other, Series) + # TODO: mypy doesn't like this line because the index + # property is not defined on SingleColumnFrame (or Index, + # for that matter). Ignoring is the easy solution for now, + # a cleaner fix requires reworking the type hierarchy. + and not self.index.equals(other.index) # type: ignore + ) + ): + raise ValueError( + "Can only compare identically-labeled Series objects" + ) + lhs, other = _align_indices([self, other], allow_non_unique=True) + else: + lhs = self - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g']) - >>> a - a 1 - c 2 - d 3 - e - f 10 - g 20 - dtype: int64 - >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e']) - >>> b - a -10 - b 23 - c -1 - d - e - dtype: int64 - >>> a.gt(b) - a True - b False - c True - d False - e False - f False - g False - dtype: bool - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other=other, fn="gt", fill_value=fill_value, can_reindex=True + operands = lhs._make_operands_for_binop(other, fill_value, reflect) + return ( + lhs._from_data( + data=lhs._colwise_binop(operands, fn), index=lhs._index, + ) + if operands is not NotImplemented + else NotImplemented ) - def ge(self, other, fill_value=None, axis=0): - """Greater than or equal to of series and other, element-wise - (binary operator ge). + def logical_and(self, other): + return self._binaryop(other, "l_and").astype(np.bool_) - Parameters - ---------- - other : Series or scalar value - fill_value : None or value - Value to fill nulls with before computation. If data in both - corresponding Series locations is null the result will be null + def remainder(self, other): + return self._binaryop(other, "mod") - Returns - ------- - Series - The result of the operation. 
+ def logical_or(self, other): + return self._binaryop(other, "l_or").astype(np.bool_) - Examples - -------- - >>> import cudf - >>> a = cudf.Series([1, 2, 3, None, 10, 20], index=['a', 'c', 'd', 'e', 'f', 'g']) - >>> a - a 1 - c 2 - d 3 - e - f 10 - g 20 - dtype: int64 - >>> b = cudf.Series([-10, 23, -1, None, None], index=['a', 'b', 'c', 'd', 'e']) - >>> b - a -10 - b 23 - c -1 - d - e - dtype: int64 - >>> a.ge(b) - a True - b False - c True - d False - e False - f False - g False - dtype: bool - """ # noqa: E501 - if axis != 0: - raise NotImplementedError("Only axis=0 supported at this time.") - return self._binaryop( - other=other, fn="ge", fill_value=fill_value, can_reindex=True - ) + def logical_not(self): + return self._unaryop("not") @copy_docstring(CategoricalAccessor) # type: ignore @property @@ -2484,7 +1658,7 @@ def to_array(self, fillna=None): warnings.warn( "The to_array method will be removed in a future cuDF " "release. Consider using `to_numpy` instead.", - DeprecationWarning, + FutureWarning, ) return self._column.to_array(fillna=fillna) @@ -2591,66 +1765,9 @@ def data(self): """ # noqa: E501 return self._column.data - @property - def index(self): - """The index object - """ - return self._index - - @index.setter - def index(self, _index): - self._index = as_index(_index) - - @property - def loc(self): - """ - Select values by label. - - See also - -------- - cudf.DataFrame.loc - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12], index=['a', 'b', 'c']) - >>> series - a 10 - b 11 - c 12 - dtype: int64 - >>> series.loc['b'] - 11 - """ - return _SeriesLocIndexer(self) - - @property - def iloc(self): - """ - Select values by position. - - See also - -------- - cudf.DataFrame.iloc - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([10, 20, 30]) - >>> s - 0 10 - 1 20 - 2 30 - dtype: int64 - >>> s.iloc[2] - 30 - """ - return _SeriesIlocIndexer(self) - @property def nullmask(self): - """The gpu buffer for the null-mask - """ + """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) def as_mask(self): @@ -2790,37 +1907,6 @@ def astype(self, dtype, copy=False, errors="raise"): pass return self - def argsort(self, ascending=True, na_position="last"): - """Returns a Series of int64 index that will sort the series. - - Uses Thrust sort. - - Returns - ------- - result: Series - - Examples - -------- - >>> import cudf - >>> s = cudf.Series([3, 1, 2]) - >>> s - 0 3 - 1 1 - 2 2 - dtype: int64 - >>> s.argsort() - 0 1 - 1 2 - 2 0 - dtype: int32 - >>> s[s.argsort()] - 1 1 - 2 2 - 0 3 - dtype: int64 - """ - return self._sort(ascending=ascending, na_position=na_position)[1] - def sort_index(self, axis=0, *args, **kwargs): if axis not in (0, "index"): raise ValueError("Only axis=0 is valid for Series.") @@ -2835,28 +1921,28 @@ def sort_values( na_position="last", ignore_index=False, ): - """ - Sort by the values. - - Sort a Series in ascending or descending order by some criterion. + """Sort by the values along either axis. Parameters ---------- - ascending : bool, default True - If True, sort values in ascending order, otherwise descending. + ascending : bool or list of bool, default True + Sort ascending vs. descending. Specify list for multiple sort + orders. If this is a list of bools, must match the length of the + by. na_position : {‘first’, ‘last’}, default ‘last’ - 'first' puts nulls at the beginning, 'last' puts nulls at the end. 
+ 'first' puts nulls at the beginning, 'last' puts nulls at the end ignore_index : bool, default False If True, index will not be sorted. Returns ------- - sorted_obj : cuDF Series + Series : Series with sorted values. Notes ----- Difference from pandas: - * Not supporting: `inplace`, `kind` + * Support axis='index' only. + * Not supporting: inplace, kind Examples -------- @@ -2870,38 +1956,15 @@ def sort_values( 1 5 dtype: int64 """ - - if inplace: - raise NotImplementedError("`inplace` not currently implemented.") - if kind != "quicksort": - raise NotImplementedError("`kind` not currently implemented.") - if axis != 0: - raise NotImplementedError("`axis` not currently implemented.") - - if len(self) == 0: - return self - vals, inds = self._sort(ascending=ascending, na_position=na_position) - if not ignore_index: - index = self.index.take(inds) - else: - index = self.index - return vals.set_index(index) - - def _n_largest_or_smallest(self, largest, n, keep): - direction = largest - if keep == "first": - if n < 0: - n = 0 - return self.sort_values(ascending=not direction).head(n) - elif keep == "last": - data = self.sort_values(ascending=direction) - if n <= 0: - data = data[-n:-n] - else: - data = data.tail(n) - return data.reverse() - else: - raise ValueError('keep must be either "first", "last"') + return super().sort_values( + by=self.name, + axis=axis, + ascending=ascending, + inplace=inplace, + kind=kind, + na_position=na_position, + ignore_index=ignore_index, + ) def nlargest(self, n=5, keep="first"): """Returns a new Series of the *n* largest element. @@ -2963,7 +2026,7 @@ def nlargest(self, n=5, keep="first"): Brunei 434000 dtype: int64 """ - return self._n_largest_or_smallest(n=n, keep=keep, largest=True) + return self._n_largest_or_smallest(True, n, [self.name], keep) def nsmallest(self, n=5, keep="first"): """ @@ -3038,22 +2101,29 @@ def nsmallest(self, n=5, keep="first"): Tuvalu 11300 dtype: int64 """ - return self._n_largest_or_smallest(n=n, keep=keep, largest=False) - - def _sort(self, ascending=True, na_position="last"): - """ - Sort by values + return self._n_largest_or_smallest(False, n, [self.name], keep) - Returns - ------- - 2-tuple of key and index - """ - col_keys, col_inds = self._column.sort_by_values( - ascending=ascending, na_position=na_position + def argsort( + self, + axis=0, + kind="quicksort", + order=None, + ascending=True, + na_position="last", + ): + obj = self.__class__._from_data( + { + None: super().argsort( + axis=axis, + kind=kind, + order=order, + ascending=ascending, + na_position=na_position, + ) + } ) - sr_keys = self._from_data({self.name: col_keys}, self._index) - sr_inds = self._from_data({self.name: col_inds}, self._index) - return sr_keys, sr_inds + obj.name = self.name + return obj def replace(self, to_replace=None, value=None, *args, **kwargs): if is_dict_like(to_replace) and value is not None: @@ -3219,31 +2289,44 @@ def one_hot_encoding(self, cats, dtype="float64"): 3 0.0 dtype: float64] """ + + warnings.warn( + "Series.one_hot_encoding is deprecated and will be removed in " + "future, use `get_dummies` instead.", + FutureWarning, + ) + if hasattr(cats, "to_arrow"): cats = cats.to_pandas() else: cats = pd.Series(cats, dtype="object") dtype = cudf.dtype(dtype) - def encode(cat): - if cat is None: - if self.dtype.kind == "f": - # Need to ignore `np.nan` values incase - # of a float column - return self.__class__( - libcudf.unary.is_null((self._column)) - ) - else: - return self.isnull() - elif np.issubdtype(type(cat), np.floating) and 
np.isnan(cat): - return self.__class__(libcudf.unary.is_nan(self._column)) - else: - return (self == cat).fillna(False) + try: + cats_col = as_column(cats, nan_as_null=False, dtype=self.dtype) + except TypeError: + raise ValueError("Cannot convert `cats` as cudf column.") + + if self._column.size * cats_col.size >= np.iinfo("int32").max: + raise ValueError( + "Size limitation exceeded: series.size * category.size < " + "np.iinfo('int32').max. Consider reducing size of category" + ) - return [encode(cat).astype(dtype) for cat in cats] + res = libcudf.transform.one_hot_encode(self._column, cats_col) + if dtype.type == np.bool_: + return [ + Series._from_data({None: x}, index=self._index) + for x in list(res.values()) + ] + else: + return [ + Series._from_data({None: x.astype(dtype)}, index=self._index) + for x in list(res.values()) + ] def label_encoding(self, cats, dtype=None, na_sentinel=-1): - """Perform label encoding + """Perform label encoding. Parameters ---------- @@ -3300,6 +2383,10 @@ def label_encoding(self, cats, dtype=None, na_sentinel=-1): FutureWarning, ) + return self._label_encoding(cats, dtype, na_sentinel) + + def _label_encoding(self, cats, dtype=None, na_sentinel=-1): + # Private implementation of deprecated public label_encoding method def _return_sentinel_series(): return Series( cudf.core.column.full( @@ -3366,7 +2453,10 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): Notes ----- UDFs are cached in memory to avoid recompilation. The first - call to the UDF will incur compilation overhead. + call to the UDF will incur compilation overhead. `func` may + call nested functions that are decorated with the decorator + `numba.cuda.jit(device=True)`, otherwise numba will raise a + typing error. Examples -------- @@ -3382,6 +2472,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): dtype: int64 Apply a basic function to a series with nulls + >>> sr = cudf.Series([1,cudf.NA,3]) >>> def f(x): ... return x + 1 @@ -3393,6 +2484,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): Use a function that does something conditionally, based on if the value is or is not null + >>> sr = cudf.Series([1,cudf.NA,3]) >>> def f(x): ... if x is cudf.NA: @@ -3419,16 +2511,21 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): 1 2 4.5 dtype: float64 - - - """ if args or kwargs: raise ValueError( "UDFs using *args or **kwargs are not yet supported." ) - return super()._apply(func) + # these functions are generally written as functions of scalar + # values rather than rows. Rather than writing an entirely separate + # numba kernel that is not built around a row object, its simpler + # to just turn this into the equivalent single column dataframe case + name = self.name or "__temp_srname" + df = cudf.DataFrame({name: self}) + f_ = cuda.jit(device=True)(func) + + return df.apply(lambda row: f_(row[name])) def applymap(self, udf, out_dtype=None): """Apply an elementwise function to transform the values in the Column. 
@@ -3640,8 +2737,11 @@ def mode(self, dropna=True): return Series(val_counts.index.sort_values(), name=self.name) def round(self, decimals=0, how="half_even"): - if not isinstance(decimals, int): - raise ValueError("decimals must be an int") + if not is_integer(decimals): + raise ValueError( + f"decimals must be an int, got {type(decimals).__name__}" + ) + decimals = int(decimals) return super().round(decimals, how) def cov(self, other, min_periods=None): @@ -3971,137 +3071,20 @@ def value_counts( res = res / float(res._column.sum()) return res - def scale(self): - """ - Scale values to [0, 1] in float64 - - Returns - ------- - Series - A new series with values scaled to [0, 1]. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 0.5, 1]) - >>> series - 0 10.0 - 1 11.0 - 2 12.0 - 3 0.5 - 4 1.0 - dtype: float64 - >>> series.scale() - 0 0.826087 - 1 0.913043 - 2 1.000000 - 3 0.000000 - 4 0.043478 - dtype: float64 - """ - vmin = self.min() - vmax = self.max() - scaled = (self - vmin) / (vmax - vmin) - scaled._index = self._index.copy(deep=False) - return scaled - - # Absolute - def abs(self): - """Absolute value of each element of the series. - - Returns - ------- - abs - Series containing the absolute value of each element. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([-1.10, 2, -3.33, 4]) - >>> series - 0 -1.10 - 1 2.00 - 2 -3.33 - 3 4.00 - dtype: float64 - >>> series.abs() - 0 1.10 - 1 2.00 - 2 3.33 - 3 4.00 - dtype: float64 - """ - return self._unaryop("abs") - - # Rounding - def ceil(self): - """ - Rounds each value upward to the smallest integral value not less - than the original. - - Returns - ------- - res - Returns a new Series with ceiling value of each element. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([1.1, 2.8, 3.5, 4.5]) - >>> series - 0 1.1 - 1 2.8 - 2 3.5 - 3 4.5 - dtype: float64 - >>> series.ceil() - 0 2.0 - 1 3.0 - 2 4.0 - 3 5.0 - dtype: float64 - """ - return self._unaryop("ceil") - - def floor(self): - """Rounds each value downward to the largest integral value not greater - than the original. - - Returns - ------- - res - Returns a new Series with floor of each element. - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([-1.9, 2, 0.2, 1.5, 0.0, 3.0]) - >>> series - 0 -1.9 - 1 2.0 - 2 0.2 - 3 1.5 - 4 0.0 - 5 3.0 - dtype: float64 - >>> series.floor() - 0 -2.0 - 1 2.0 - 2 0.0 - 3 1.0 - 4 0.0 - 5 3.0 - dtype: float64 - """ - return self._unaryop("floor") - - def hash_values(self): + def hash_values(self, method="murmur3"): """Compute the hash of values in this column. + Parameters + ---------- + method : {'murmur3', 'md5'}, default 'murmur3' + Hash function to use: + * murmur3: MurmurHash3 hash function. + * md5: MD5 hash function. + Returns ------- - cupy array - A cupy array with hash values. + Series + A Series with hash values. 
Examples -------- @@ -4112,14 +3095,27 @@ def hash_values(self): 1 120 2 30 dtype: int64 - >>> series.hash_values() - array([-1930516747, 422619251, -941520876], dtype=int32) + >>> series.hash_values(method="murmur3") + 0 -1930516747 + 1 422619251 + 2 -941520876 + dtype: int32 + >>> series.hash_values(method="md5") + 0 7be4bbacbfdb05fb3044e36c22b41e8b + 1 947ca8d2c5f0f27437f156cfbfab0969 + 2 d0580ef52d27c043c8e341fd5039b166 + dtype: object """ - return Series(self._hash()).values + return Series._from_data( + {None: self._hash(method=method)}, index=self.index + ) def hash_encode(self, stop, use_name=False): """Encode column values as ints in [0, stop) using hash function. + This method is deprecated. Replace ``series.hash_encode(stop, + use_name=False)`` with ``series.hash_values(method="murmur3") % stop``. + Parameters ---------- stop : int @@ -4154,17 +3150,41 @@ def hash_encode(self, stop, use_name=False): 2 76 dtype: int32 """ + warnings.warn( + "The `hash_encode` method will be removed in a future cuDF " + "release. Replace `series.hash_encode(stop, use_name=False)` " + 'with `series.hash_values(method="murmur3") % stop`.', + FutureWarning, + ) + if not stop > 0: raise ValueError("stop must be a positive integer.") - initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None - hashed_values = Series(self._hash(initial_hash)) + if use_name: + name_hasher = sha256() + name_hasher.update(str(self.name).encode()) + name_hash_bytes = name_hasher.digest()[:4] + name_hash_int = ( + int.from_bytes(name_hash_bytes, "little", signed=False) + & 0xFFFFFFFF + ) + initial_hash = [name_hash_int] + else: + initial_hash = None + + hashed_values = Series._from_data( + { + self.name: self._hash( + method="murmur3", initial_hash=initial_hash + ) + }, + self.index, + ) if hashed_values.has_nulls: raise ValueError("Column must have no nulls.") - mod_vals = hashed_values % stop - return Series(mod_vals._column, index=self.index, name=self.name) + return hashed_values % stop def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True @@ -4791,6 +3811,64 @@ def pct_change( return change +def make_binop_func(op): + # This function is used to wrap binary operations in Frame with an + # appropriate API for Series as required for pandas compatibility. The + # main effect is reordering and error-checking parameters in + # Series-specific ways. + wrapped_func = getattr(Frame, op) + + @functools.wraps(wrapped_func) + def wrapper(self, other, level=None, fill_value=None, axis=0): + if axis != 0: + raise NotImplementedError("Only axis=0 supported at this time.") + return wrapped_func(self, other, axis, level, fill_value) + + # functools.wraps copies module level attributes to `wrapper` and sets + # __wrapped__ attributes to `wrapped_func`. Cpython looks up the signature + # string of a function by recursively delving into __wrapped__ until + # it hits the first function that has __signature__ attribute set. To make + # the signature stirng of `wrapper` matches with its actual parameter list, + # we directly set the __signature__ attribute of `wrapper` below. + + new_sig = inspect.signature( + lambda self, other, level=None, fill_value=None, axis=0: None + ) + wrapper.__signature__ = new_sig + return wrapper + + +# Wrap all Frame binop functions with the expected API for Series. 
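Editorial aside (not part of the diff): the `__signature__` override that `make_binop_func` applies can be shown in isolation. The sketch below uses hypothetical stand-in names (`frame_add`, `series_add`) to illustrate why `functools.wraps` alone would make `inspect.signature` report the wrapped Frame-style parameter order, and how assigning `__signature__` restores the Series-style ordering.

    import functools
    import inspect

    def frame_add(self, other, axis, level, fill_value):
        # Stand-in for a Frame-level binop with Frame-style parameter order.
        return (self, other, axis, level, fill_value)

    @functools.wraps(frame_add)
    def series_add(self, other, level=None, fill_value=None, axis=0):
        # Reorder the pandas-style Series arguments into the Frame order.
        return frame_add(self, other, axis, level, fill_value)

    # functools.wraps sets series_add.__wrapped__ = frame_add, so without an
    # explicit __signature__, inspect.signature would follow __wrapped__ and
    # report frame_add's (self, other, axis, level, fill_value) ordering.
    series_add.__signature__ = inspect.signature(
        lambda self, other, level=None, fill_value=None, axis=0: None
    )

    assert "fill_value=None" in str(inspect.signature(series_add))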
+for binop in ( + "add", + "radd", + "subtract", + "sub", + "rsub", + "multiply", + "mul", + "rmul", + "mod", + "rmod", + "pow", + "rpow", + "floordiv", + "rfloordiv", + "truediv", + "div", + "divide", + "rtruediv", + "rdiv", + "eq", + "ne", + "lt", + "le", + "gt", + "ge", +): + setattr(Series, binop, make_binop_func(binop)) + + class DatetimeProperties(object): """ Accessor object for datetimelike properties of the Series values. @@ -6048,8 +5126,8 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): result_col = column.as_column(result) if a_col.null_count and b_col.null_count: - a_nulls = a_col.isna() - b_nulls = b_col.isna() + a_nulls = a_col.isnull() + b_nulls = b_col.isnull() null_values = a_nulls | b_nulls if equal_nan is True: @@ -6057,9 +5135,9 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): del a_nulls, b_nulls elif a_col.null_count: - null_values = a_col.isna() + null_values = a_col.isnull() elif b_col.null_count: - null_values = b_col.isna() + null_values = b_col.isnull() else: return Series(result_col, index=index) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py new file mode 100644 index 00000000000..7793a2fdf29 --- /dev/null +++ b/python/cudf/cudf/core/single_column_frame.py @@ -0,0 +1,338 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. +"""Base class for Frame types that only have a single column.""" + +from __future__ import annotations + +import warnings +from typing import Any, Dict, MutableMapping, Optional, Tuple, TypeVar, Union + +import cupy +import numpy as np +import pandas as pd + +import cudf +from cudf._typing import Dtype +from cudf.api.types import _is_scalar_or_zero_d_array +from cudf.core.column import ColumnBase, as_column +from cudf.core.frame import Frame + +T = TypeVar("T", bound="Frame") + + +class SingleColumnFrame(Frame): + """A one-dimensional frame. + + Frames with only a single column share certain logic that is encoded in + this class. + """ + + _SUPPORT_AXIS_LOOKUP = { + 0: 0, + None: 0, + "index": 0, + } + + def _reduce( + self, op, axis=None, level=None, numeric_only=None, **kwargs, + ): + if axis not in (None, 0): + raise NotImplementedError("axis parameter is not implemented yet") + + if level is not None: + raise NotImplementedError("level parameter is not implemented yet") + + if numeric_only not in (None, True): + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + return getattr(self._column, op)(**kwargs) + + def _scan(self, op, axis=None, *args, **kwargs): + if axis not in (None, 0): + raise NotImplementedError("axis parameter is not implemented yet") + + return super()._scan(op, axis=axis, *args, **kwargs) + + @classmethod + def _from_data( + cls, + data: MutableMapping, + index: Optional[cudf.core.index.BaseIndex] = None, + name: Any = None, + ): + + out = super()._from_data(data, index) + if name is not None: + out.name = name + return out + + @property + def name(self): + """Get the name of this object.""" + return next(iter(self._data.names)) + + @name.setter + def name(self, value): + self._data[value] = self._data.pop(self.name) + + @property + def ndim(self): + """Get the dimensionality (always 1 for single-columned frames).""" + return 1 + + @property + def shape(self): + """Get a tuple representing the dimensionality of the Index.""" + return (len(self),) + + def __iter__(self): + # Iterating over a GPU object is not efficient and hence not supported. 
+ # Consider using ``.to_arrow()``, ``.to_pandas()`` or ``.values_host`` + # if you wish to iterate over the values. + cudf.utils.utils.raise_iteration_error(obj=self) + + def __bool__(self): + raise TypeError( + f"The truth value of a {type(self)} is ambiguous. Use " + "a.empty, a.bool(), a.item(), a.any() or a.all()." + ) + + @property + def _num_columns(self): + return 1 + + @property + def _column(self): + return self._data[self.name] + + @_column.setter + def _column(self, value): + self._data[self.name] = value + + @property + def values(self): # noqa: D102 + return self._column.values + + @property + def values_host(self): # noqa: D102 + return self._column.values_host + + def to_cupy( + self, + dtype: Union[Dtype, None] = None, + copy: bool = True, + na_value=None, + ) -> cupy.ndarray: # noqa: D102 + return super().to_cupy(dtype, copy, na_value).flatten() + + def to_numpy( + self, + dtype: Union[Dtype, None] = None, + copy: bool = True, + na_value=None, + ) -> np.ndarray: # noqa: D102 + return super().to_numpy(dtype, copy, na_value).flatten() + + def tolist(self): # noqa: D102 + + raise TypeError( + "cuDF does not support conversion to host memory " + "via the `tolist()` method. Consider using " + "`.to_arrow().to_pylist()` to construct a Python list." + ) + + to_list = tolist + + # TODO: When this method is removed we can also remove + # ColumnBase.to_gpu_array. + def to_gpu_array(self, fillna=None): # noqa: D102 + warnings.warn( + "The to_gpu_array method will be removed in a future cuDF " + "release. Consider using `to_cupy` instead.", + FutureWarning, + ) + return self._column.to_gpu_array(fillna=fillna) + + @classmethod + def from_arrow(cls, array): + """Create from PyArrow Array/ChunkedArray. + + Parameters + ---------- + array : PyArrow Array/ChunkedArray + PyArrow Object which has to be converted. + + Raises + ------ + TypeError for invalid input type. + + Returns + ------- + SingleColumnFrame + + Examples + -------- + >>> import cudf + >>> import pyarrow as pa + >>> cudf.Index.from_arrow(pa.array(["a", "b", None])) + StringIndex(['a' 'b' None], dtype='object') + >>> cudf.Series.from_arrow(pa.array(["a", "b", None])) + 0 a + 1 b + 2 + dtype: object + """ + return cls(ColumnBase.from_arrow(array)) + + def to_arrow(self): + """ + Convert to a PyArrow Array. + + Returns + ------- + PyArrow Array + + Examples + -------- + >>> import cudf + >>> sr = cudf.Series(["a", "b", None]) + >>> sr.to_arrow() + + [ + "a", + "b", + null + ] + >>> ind = cudf.Index(["a", "b", None]) + >>> ind.to_arrow() + + [ + "a", + "b", + null + ] + """ + return self._column.to_arrow() + + @property + def is_unique(self): + """Return boolean if values in the object are unique. + + Returns + ------- + bool + """ + return self._column.is_unique + + @property + def is_monotonic(self): + """Return boolean if values in the object are monotonically increasing. + + This property is an alias for :attr:`is_monotonic_increasing`. + + Returns + ------- + bool + """ + return self.is_monotonic_increasing + + @property + def is_monotonic_increasing(self): + """Return boolean if values in the object are monotonically increasing. + + Returns + ------- + bool + """ + return self._column.is_monotonic_increasing + + @property + def is_monotonic_decreasing(self): + """Return boolean if values in the object are monotonically decreasing. 
+ + Returns + ------- + bool + """ + return self._column.is_monotonic_decreasing + + @property + def __cuda_array_interface__(self): + return self._column.__cuda_array_interface__ + + def factorize(self, na_sentinel=-1): + """Encode the input values as integer labels. + + Parameters + ---------- + na_sentinel : number + Value to indicate missing category. + + Returns + -------- + (labels, cats) : (cupy.ndarray, cupy.ndarray or Index) + - *labels* contains the encoded values + - *cats* contains the categories in order that the N-th + item corresponds to the (N-1) code. + + Examples + -------- + >>> import cudf + >>> s = cudf.Series(['a', 'a', 'c']) + >>> codes, uniques = s.factorize() + >>> codes + array([0, 0, 1], dtype=int8) + >>> uniques + StringIndex(['a' 'c'], dtype='object') + """ + return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) + + def _make_operands_for_binop( + self, + other: T, + fill_value: Any = None, + reflect: bool = False, + *args, + **kwargs, + ) -> Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]]: + """Generate the dictionary of operands used for a binary operation. + + Parameters + ---------- + other : SingleColumnFrame + The second operand. + fill_value : Any, default None + The value to replace null values with. If ``None``, nulls are not + filled before the operation. + reflect : bool, default False + If ``True``, swap the order of the operands. See + https://docs.python.org/3/reference/datamodel.html#object.__ror__ + for more information on when this is necessary. + + Returns + ------- + Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]] + The operands to be passed to _colwise_binop. + """ + # Get the appropriate name for output operations involving two objects + # that are Series-like objects. The output shares the lhs's name unless + # the rhs is a _differently_ named Series-like object. + if ( + isinstance(other, (SingleColumnFrame, pd.Series, pd.Index)) + and self.name != other.name + ): + result_name = None + else: + result_name = self.name + + # This needs to be tested correctly + if isinstance(other, SingleColumnFrame): + other = other._column + elif not _is_scalar_or_zero_d_array(other): + # Non-scalar right operands are valid iff they convert to columns. + try: + other = as_column(other) + except Exception: + return NotImplemented + + return {result_name: (self._column, other, reflect, fill_value)} diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 60139f7d7af..3502fc9acae 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -134,12 +134,12 @@ def __call__( Examples -------- >>> import cudf - >>> from cudf.utils.hash_vocab_utils import hash_vocab + >>> from cudf.utils.hash_vocab_utils import hash_vocab >>> hash_vocab('bert-base-cased-vocab.txt', 'voc_hash.txt') >>> from cudf.core.subword_tokenizer import SubwordTokenizer - >>> cudf_tokenizer = SubwordTokenizer('voc_hash.txt', + >>> cudf_tokenizer = SubwordTokenizer('voc_hash.txt', ... do_lower_case=True) >>> str_series = cudf.Series(['This is the', 'best book']) >>> tokenizer_output = cudf_tokenizer(str_series, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index e17c58d1db7..31fb7c7023c 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,8 +1,11 @@ # Copyright (c) 2019-2021, NVIDIA CORPORATION. 
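Editorial aside on the `_make_operands_for_binop` naming rule added in single_column_frame.py above, before the tools/datetimes.py changes continue below: the result of a binary operation keeps the left operand's name unless the right operand is a differently named Series-like object. A minimal standalone sketch of that rule, using a hypothetical helper rather than cuDF code:

    def _binop_result_name(lhs_name, rhs):
        """Mimic the name resolution used by _make_operands_for_binop.

        `rhs` may be a scalar or any object with a `.name` attribute
        (a Series-like operand).
        """
        rhs_name = getattr(rhs, "name", lhs_name)
        return lhs_name if lhs_name == rhs_name else None

    class _Named:
        def __init__(self, name):
            self.name = name

    assert _binop_result_name("a", 5) == "a"             # scalar rhs keeps lhs name
    assert _binop_result_name("a", _Named("a")) == "a"   # same-named Series-like
    assert _binop_result_name("a", _Named("b")) is None  # differently named -> None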
+import math +import re import warnings from typing import Sequence, Union +import cupy as cp import numpy as np import pandas as pd from pandas.core.tools.datetimes import _unit_map @@ -26,6 +29,20 @@ "D": "datetime64[s]", } +_offset_alias_to_code = { + "D": "D", + "H": "h", + "h": "h", + "T": "m", + "min": "m", + "s": "s", + "S": "s", + "U": "us", + "us": "us", + "N": "ns", + "ns": "ns", +} + def to_datetime( arg, @@ -614,6 +631,18 @@ def _from_freqstr(cls, freqstr): return cls(**{cls._CODES_TO_UNITS[freq_part]: int(numeric_part)}) + def _maybe_as_fast_pandas_offset(self): + if ( + len(self.kwds) == 1 + and _has_fixed_frequency(self) + and not _has_non_fixed_frequency(self) + ): + # Pandas computation between `n*offsets.Minute()` is faster than + # `n*DateOffset`. If only single offset unit is in use, we return + # the base offset for faster binary ops. + return pd.tseries.frequencies.to_offset(pd.Timedelta(**self.kwds)) + return pd.DateOffset(**self.kwds, n=1) + def _isin_datetimelike( lhs: Union[column.TimeDeltaColumn, column.DatetimeColumn], values: Sequence @@ -655,6 +684,276 @@ def _isin_datetimelike( return res +def date_range( + start=None, + end=None, + periods=None, + freq=None, + tz=None, + normalize=False, + name=None, + closed=None, +): + """Return a fixed frequency DatetimeIndex. + + Returns the range of equally spaced time points (where the difference + between any two adjacent points is specified by the given frequency) + such that they all satisfy `start` <[=] x <[=] `end`, where the first one + and the last one are, resp., the first and last time points in that range + that are valid for `freq`. + + Parameters + ---------- + start : str or datetime-like, optional + Left bound for generating dates. + + end : str or datetime-like, optional + Right bound for generating dates. + + periods : int, optional + Number of periods to generate. + + freq : str or DateOffset + Frequencies to generate the datetime series. Mixed fixed-frequency and + non-fixed frequency offset is unsupported. See notes for detail. + Supported offset alias: ``D``, ``h``, ``H``, ``T``, ``min``, ``S``, + ``U``, ``us``, ``N``, ``ns``. + + tz : str or tzinfo, optional + Not Supported + + normalize : bool, default False + Not Supported + + name : str, default None + Name of the resulting DatetimeIndex + + closed : {None, 'left', 'right'}, optional + Not Supported + + Returns + ------- + DatetimeIndex + + Notes + ----- + Of the four parameters `start`, `end`, `periods`, and `freq`, exactly three + must be specified. If `freq` is omitted, the resulting DatetimeIndex will + have periods linearly spaced elements between start and end (closed on both + sides). + + cudf supports `freq` specified with either fixed-frequency offset + (such as weeks, days, hours, minutes...) or non-fixed frequency offset + (such as years and months). Specifying `freq` with a mixed fixed and + non-fixed frequency is currently unsupported. For example: + + >>> cudf.date_range( + ... start='2021-08-23 08:00:00', + ... freq=cudf.DateOffset(months=2, days=5), + ... periods=5) + ... + NotImplementedError: Mixing fixed and non-fixed frequency offset is + unsupported. + + Examples + -------- + >>> cudf.date_range( + ... start='2021-08-23 08:00:00', + ... freq=cudf.DateOffset(years=1, months=2), + ... 
periods=5) + DatetimeIndex(['2021-08-23 08:00:00', '2022-10-23 08:00:00', + '2023-12-23 08:00:00', '2025-02-23 08:00:00', + '2026-04-23 08:00:00'], + dtype='datetime64[ns]') + + """ + if tz is not None: + raise NotImplementedError("tz is currently unsupported.") + + if closed is not None: + raise NotImplementedError("closed is currently unsupported.") + + if (start, end, periods, freq).count(None) > 1: + raise ValueError( + "Of the four parameters: start, end, periods, and freq, exactly " + "three must be specified" + ) + + dtype = np.dtype("= start + + periods = math.ceil( + int(end - start) / _offset_to_nanoseconds_lower_bound(offset) + ) + + if periods < 0: + # Mismatched sign between (end-start) and offset, return empty + # column + periods = 0 + elif periods == 0: + # end == start, return exactly 1 timestamp (start) + periods = 1 + + # The estimated upper bound of `end` is enforced to be computed to make + # sure overflow components are raised before actually computing the + # sequence. + # FIXME: when `end_estim` is out of bound, but the actual `end` is not, + # we shouldn't raise but compute the sequence as is. The trailing overflow + # part should get trimmed at the end. + end_estim = ( + pd.Timestamp(start.value) + + (periods - 1) * offset._maybe_as_fast_pandas_offset() + ).to_datetime64() + + if "months" in offset.kwds or "years" in offset.kwds: + # If `offset` is non-fixed frequency, resort to libcudf. + res = libcudf.datetime.date_range(start.device_value, periods, offset) + if _periods_not_specified: + # As mentioned in [1], this is a post processing step to trim extra + # elements when `periods` is an estimated value. Only offset + # specified with non fixed frequencies requires trimming. + res = res[ + (res <= end) if _is_increment_sequence else (res <= start) + ] + else: + # If `offset` is fixed frequency, we treat both timestamps as integers + # and evenly divide the given integer range. + arr = cp.linspace( + start=start.value.astype("int64"), + stop=end_estim.astype("int64"), + num=periods, + ) + res = cudf.core.column.as_column(arr).astype("datetime64[ns]") + + return cudf.DatetimeIndex._from_data({name: res}) + + +def _has_fixed_frequency(freq: DateOffset) -> bool: + """Utility to determine if `freq` contains fixed frequency offset + """ + fixed_frequencies = { + "weeks", + "days", + "hours", + "minutes", + "seconds", + "milliseconds", + "microseconds", + "nanoseconds", + } + + return len(freq.kwds.keys() & fixed_frequencies) > 0 + + +def _has_non_fixed_frequency(freq: DateOffset) -> bool: + """Utility to determine if `freq` contains non-fixed frequency offset + """ + non_fixed_frequencies = {"years", "months"} + return len(freq.kwds.keys() & non_fixed_frequencies) > 0 + + +def _has_mixed_freqeuency(freq: DateOffset) -> bool: + """Utility to determine if `freq` contains mixed fixed and non-fixed + frequency offset. e.g. {months=1, days=5} + """ + + return _has_fixed_frequency(freq) and _has_non_fixed_frequency(freq) + + +def _offset_to_nanoseconds_lower_bound(offset: DateOffset) -> int: + """Given a DateOffset, which can consist of either fixed frequency or + non-fixed frequency offset, convert to the smallest possible fixed + frequency offset based in nanoseconds. + + Specifically, the smallest fixed frequency conversion for {months=1} + is 28 * nano_seconds_per_day, because 1 month contains at least 28 days. + Similarly, the smallest fixed frequency conversion for {year=1} is + 365 * nano_seconds_per_day. 
+ + This utility is used to compute the upper bound of the count of timestamps + given a range of datetime and an offset. + """ + nanoseconds_per_day = 24 * 60 * 60 * 1e9 + kwds = offset.kwds + return ( + kwds.get("years", 0) * (365 * nanoseconds_per_day) + + kwds.get("months", 0) * (28 * nanoseconds_per_day) + + kwds.get("weeks", 0) * (7 * nanoseconds_per_day) + + kwds.get("days", 0) * nanoseconds_per_day + + kwds.get("hours", 0) * 3600 * 1e9 + + kwds.get("minutes", 0) * 60 * 1e9 + + kwds.get("seconds", 0) * 1e9 + + kwds.get("milliseconds", 0) * 1e6 + + kwds.get("microseconds", 0) * 1e3 + + kwds.get("nanoseconds", 0) + ) + + def _to_iso_calendar(arg): formats = ["%G", "%V", "%u"] if not isinstance(arg, (cudf.Index, cudf.core.series.DatetimeProperties)): diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 8c69b94cc84..7c688b92009 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -226,8 +226,7 @@ def _convert_str_col(col, errors, _downcast=None): def _proc_inf_empty_strings(col): - """Handles empty and infinity strings - """ + """Handles empty and infinity strings""" col = libstrings.to_lower(col) col = _proc_empty_strings(col) col = _proc_inf_strings(col) @@ -235,8 +234,7 @@ def _proc_inf_empty_strings(col): def _proc_empty_strings(col): - """Replaces empty strings with NaN - """ + """Replaces empty strings with NaN""" s = cudf.Series(col) s = s.where(s != "", "NaN") return s._column @@ -246,6 +244,8 @@ def _proc_inf_strings(col): """Convert "inf/infinity" strings into "Inf", the native string representing infinity in libcudf """ + # TODO: This can be handled by libcudf in + # future see StringColumn.as_numerical_column col = libstrings.replace_multi( col, as_column(["+", "inf", "inity"]), as_column(["", "Inf", ""]), ) diff --git a/python/cudf/cudf/core/udf/_ops.py b/python/cudf/cudf/core/udf/_ops.py index 25201356fd9..7307b43fceb 100644 --- a/python/cudf/cudf/core/udf/_ops.py +++ b/python/cudf/cudf/core/udf/_ops.py @@ -1,3 +1,4 @@ +import math import operator arith_ops = [ @@ -10,6 +11,42 @@ operator.pow, ] +bitwise_ops = [operator.and_, operator.or_, operator.xor] + +unary_ops = [ + math.acos, + math.acosh, + math.asin, + math.asinh, + math.atan, + math.atanh, + math.ceil, + math.cos, + math.degrees, + math.erf, + math.erfc, + math.exp, + math.expm1, + math.fabs, + math.floor, + math.gamma, + math.lgamma, + math.log, + math.log10, + math.log1p, + math.log2, + math.radians, + math.sin, + math.sinh, + math.sqrt, + math.tan, + math.tanh, + operator.pos, + operator.neg, + operator.not_, + operator.invert, +] + comparison_ops = [ operator.eq, operator.ne, diff --git a/python/cudf/cudf/core/udf/lowering.py b/python/cudf/cudf/core/udf/lowering.py index 3986abc2bf0..3b6b3b4b831 100644 --- a/python/cudf/cudf/core/udf/lowering.py +++ b/python/cudf/cudf/core/udf/lowering.py @@ -10,7 +10,12 @@ from numba.extending import lower_builtin, types from cudf.core.udf import api -from cudf.core.udf._ops import arith_ops, comparison_ops +from cudf.core.udf._ops import ( + arith_ops, + bitwise_ops, + comparison_ops, + unary_ops, +) from cudf.core.udf.typing import MaskedType, NAType @@ -78,6 +83,49 @@ def masked_scalar_op_impl(context, builder, sig, args): return masked_scalar_op_impl +def make_unary_op(op): + """ + Make closures that implement unary operations. See register_unary_op for + details. 
+ """ + + def masked_scalar_unary_op_impl(context, builder, sig, args): + """ + Implement `MaskedType` + """ + # MaskedType(...) + masked_type_1 = sig.args[0] + # MaskedType(...) + masked_return_type = sig.return_type + + m1 = cgutils.create_struct_proxy(masked_type_1)( + context, builder, value=args[0] + ) + + # we will return an output struct + result = cgutils.create_struct_proxy(masked_return_type)( + context, builder + ) + + # compute output validity + result.valid = m1.valid + with builder.if_then(m1.valid): + # Let numba handle generating the extra IR needed to perform + # operations on mixed types, by compiling the final core op between + # the two primitive values as a separate function and calling it + result.value = context.compile_internal( + builder, + lambda x: op(x), + nb_signature( + masked_return_type.value_type, masked_type_1.value_type, + ), + (m1.value,), + ) + return result._getvalue() + + return masked_scalar_unary_op_impl + + def register_arithmetic_op(op): """ Register a lowering implementation for the @@ -95,6 +143,23 @@ def register_arithmetic_op(op): cuda_lower(op, MaskedType, MaskedType)(to_lower_op) +def register_unary_op(op): + """ + Register a lowering implementation for the + unary op `op`. + + Because the lowering implementations compile the final + op separately using a lambda and compile_internal, `op` + needs to be tied to each lowering implementation using + a closure. + + This function makes and lowers a closure for one op. + + """ + to_lower_op = make_unary_op(op) + cuda_lower(op, MaskedType)(to_lower_op) + + def masked_scalar_null_op_impl(context, builder, sig, args): """ Implement `MaskedType` `NAType` @@ -155,15 +220,23 @@ def register_const_op(op): cuda_lower(op, types.Number, MaskedType)(to_lower_op) cuda_lower(op, MaskedType, types.Boolean)(to_lower_op) cuda_lower(op, types.Boolean, MaskedType)(to_lower_op) + cuda_lower(op, MaskedType, types.NPDatetime)(to_lower_op) + cuda_lower(op, types.NPDatetime, MaskedType)(to_lower_op) + cuda_lower(op, MaskedType, types.NPTimedelta)(to_lower_op) + cuda_lower(op, types.NPTimedelta, MaskedType)(to_lower_op) # register all lowering at init -for op in arith_ops + comparison_ops: - register_arithmetic_op(op) - register_const_op(op) +for binary_op in arith_ops + bitwise_ops + comparison_ops: + register_arithmetic_op(binary_op) + register_const_op(binary_op) # null op impl can be shared between all ops - cuda_lower(op, MaskedType, NAType)(masked_scalar_null_op_impl) - cuda_lower(op, NAType, MaskedType)(masked_scalar_null_op_impl) + cuda_lower(binary_op, MaskedType, NAType)(masked_scalar_null_op_impl) + cuda_lower(binary_op, NAType, MaskedType)(masked_scalar_null_op_impl) + +# register all lowering at init +for unary_op in unary_ops: + register_unary_op(unary_op) @cuda_lower(operator.is_, MaskedType, NAType) @@ -202,6 +275,8 @@ def pack_return_masked_impl(context, builder, sig, args): @cuda_lower(api.pack_return, types.Boolean) @cuda_lower(api.pack_return, types.Number) +@cuda_lower(api.pack_return, types.NPDatetime) +@cuda_lower(api.pack_return, types.NPTimedelta) def pack_return_scalar_impl(context, builder, sig, args): outdata = cgutils.create_struct_proxy(sig.return_type)(context, builder) outdata.value = args[0] @@ -269,7 +344,10 @@ def cast_masked_to_masked(context, builder, fromty, toty, val): # Masked constructor for use in a kernel for testing +@lower_builtin(api.Masked, types.Boolean, types.boolean) @lower_builtin(api.Masked, types.Number, types.boolean) +@lower_builtin(api.Masked, types.NPDatetime, 
types.boolean) +@lower_builtin(api.Masked, types.NPTimedelta, types.boolean) def masked_constructor(context, builder, sig, args): ty = sig.return_type value, valid = args diff --git a/python/cudf/cudf/core/udf/pipeline.py b/python/cudf/cudf/core/udf/pipeline.py index 7f3aa7baa93..deb4546e8b8 100644 --- a/python/cudf/cudf/core/udf/pipeline.py +++ b/python/cudf/cudf/core/udf/pipeline.py @@ -1,67 +1,163 @@ +import math +from typing import Callable + import cachetools import numpy as np -from numba import cuda +from numba import cuda, typeof from numba.np import numpy_support -from numba.types import Tuple, boolean, int64, void +from numba.types import Poison, Record, Tuple, boolean, int64, void from nvtx import annotate +from cudf.core.dtypes import CategoricalDtype from cudf.core.udf.api import Masked, pack_return from cudf.core.udf.typing import MaskedType from cudf.utils import cudautils +from cudf.utils.dtypes import ( + BOOL_TYPES, + DATETIME_TYPES, + NUMERIC_TYPES, + TIMEDELTA_TYPES, +) libcudf_bitmask_type = numpy_support.from_dtype(np.dtype("int32")) MASK_BITSIZE = np.dtype("int32").itemsize * 8 precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) +JIT_SUPPORTED_TYPES = ( + NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES +) -@annotate("NUMBA JIT", color="green", domain="cudf_python") -def get_udf_return_type(func, dtypes): - """ - Get the return type of a masked UDF for a given set of argument dtypes. It - is assumed that a `MaskedType(dtype)` is passed to the function for each - input dtype. + +def _is_jit_supported_type(dtype): + # category dtype isn't hashable + if isinstance(dtype, CategoricalDtype): + return False + return str(dtype) in JIT_SUPPORTED_TYPES + + +def all_dtypes_from_frame(frame): + return { + colname: col.dtype + if _is_jit_supported_type(col.dtype) + else np.dtype("O") + for colname, col in frame._data.items() + } + + +def supported_dtypes_from_frame(frame): + return { + colname: col.dtype + for colname, col in frame._data.items() + if _is_jit_supported_type(col.dtype) + } + + +def supported_cols_from_frame(frame): + return { + colname: col + for colname, col in frame._data.items() + if _is_jit_supported_type(col.dtype) + } + + +def generate_cache_key(frame, func: Callable): + """Create a cache key that uniquely identifies a compilation. + + A new compilation is needed any time any of the following things change: + - The UDF itself as defined in python by the user + - The types of the columns utilized by the UDF + - The existence of the input columns masks """ - to_compiler_sig = tuple( - MaskedType(arg) - for arg in (numpy_support.from_dtype(np_type) for np_type in dtypes) + return ( + *cudautils.make_cache_key(func, all_dtypes_from_frame(frame).values()), + *(col.mask is None for col in frame._data.values()), + *frame._data.keys(), ) - # Get the return type. The PTX is also returned by compile_udf, but is not - # needed here. - ptx, output_type = cudautils.compile_udf(func, to_compiler_sig) - if not isinstance(output_type, MaskedType): - numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) - else: - numba_output_type = output_type - return numba_output_type +def get_frame_row_type(dtype): + """ + Get the numba `Record` type corresponding to a frame. + Models each column and its mask as a MaskedType and + models the row as a dictionary like data structure + containing these MaskedTypes. 
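To make the input concrete: the function receives a numpy structured dtype built from the frame's column names and dtypes, and its `fields` mapping is what the loop below iterates over. A small illustration with hypothetical column names (it mirrors how the pipeline builds the dtype but is not part of this module):

    import numpy as np

    # Hypothetical two-column frame: "a" is int64, "b" is float64.
    dtypes = {"a": np.dtype("int64"), "b": np.dtype("float64")}
    row_dtype = np.dtype(list(dtypes.items()))

    # fields maps each name to (element dtype, byte offset[, title]).
    # get_frame_row_type wraps each element dtype in MaskedType and
    # recomputes the offsets to leave one extra validity byte per field.
    print(row_dtype.fields)
    # roughly: {'a': (dtype('int64'), 0), 'b': (dtype('float64'), 8)}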
+ + Large parts of this function are copied with comments + from the Numba internals and slightly modified to + account for validity bools to be present in the final + struct. + """ + # Create the numpy structured type corresponding to the numpy dtype. + + fields = [] + offset = 0 + + sizes = [val[0].itemsize for val in dtype.fields.values()] + for i, (name, info) in enumerate(dtype.fields.items()): + # *info* consists of the element dtype, its offset from the beginning + # of the record, and an optional "title" containing metadata. + # We ignore the offset in info because its value assumes no masking; + # instead, we compute the correct offset based on the masked type. + elemdtype = info[0] + title = info[2] if len(info) == 3 else None + ty = numpy_support.from_dtype(elemdtype) + infos = { + "type": MaskedType(ty), + "offset": offset, + "title": title, + } + fields.append((name, infos)) + + # increment offset by itemsize plus one byte for validity + offset += elemdtype.itemsize + 1 + + # Align the next member of the struct to be a multiple of the + # memory access size, per PTX ISA 7.4/5.4.5 + if i < len(sizes) - 1: + next_itemsize = sizes[i + 1] + offset = int(math.ceil(offset / next_itemsize) * next_itemsize) + + # Numba requires that structures are aligned for the CUDA target + _is_aligned_struct = True + return Record(fields, offset, _is_aligned_struct) -def nulludf(func): - """ - Mimic pandas API: - def f(x, y): - return x + y - df.apply(lambda row: f(row['x'], row['y'])) +@annotate("NUMBA JIT", color="green", domain="cudf_python") +def get_udf_return_type(frame, func: Callable, args=()): - in this scheme, `row` is actually the whole dataframe - `DataFrame` sends `self` in as `row` and subsequently - we end up calling `f` on the resulting columns since - the dataframe is dict-like + """ + Get the return type of a masked UDF for a given set of argument dtypes. It + is assumed that the function consumes a dictionary whose keys are strings + and whose values are of MaskedType. Initially assume that the UDF may be + written to utilize any field in the row - including those containing an + unsupported dtype. If an unsupported dtype is actually used in the function + the compilation should fail at `compile_udf`. If compilation succeeds, one + can infer that the function does not use any of the columns of unsupported + dtype - meaning we can drop them going forward and the UDF will still end + up getting fed rows containing all the fields it actually needs to use to + compute the answer for that row. """ - def wrapper(*args): - from cudf import DataFrame + # present a row containing all fields to the UDF and try and compile + row_type = get_frame_row_type( + np.dtype(list(all_dtypes_from_frame(frame).items())) + ) + compile_sig = (row_type, *(typeof(arg) for arg in args)) - # This probably creates copies but is fine for now - to_udf_table = DataFrame( - {idx: arg for idx, arg in zip(range(len(args)), args)} - ) - # Frame._apply - return to_udf_table._apply(func) + # Get the return type. The PTX is also returned by compile_udf, but is not + # needed here. 
+ ptx, output_type = cudautils.compile_udf(func, compile_sig) + if not isinstance(output_type, MaskedType): + numba_output_type = numpy_support.from_dtype(np.dtype(output_type)) + else: + numba_output_type = output_type - return wrapper + return ( + numba_output_type + if not isinstance(numba_output_type, MaskedType) + else numba_output_type.value_type + ) def masked_array_type_from_col(col): @@ -78,23 +174,23 @@ def masked_array_type_from_col(col): return Tuple((nb_scalar_ty[::1], libcudf_bitmask_type[::1])) -def construct_signature(df, return_type): +def construct_signature(frame, return_type, args): """ Build the signature of numba types that will be used to actually JIT the kernel itself later, accounting for types - and offsets + and offsets. Skips columns with unsupported dtypes. """ # Tuple of arrays, first the output data array, then the mask return_type = Tuple((return_type[::1], boolean[::1])) offsets = [] - sig = [return_type] - for col in df._data.values(): + sig = [return_type, int64] + for col in supported_cols_from_frame(frame).values(): sig.append(masked_array_type_from_col(col)) offsets.append(int64) - # return_type + data,masks + offsets + size - sig = void(*(sig + offsets + [int64])) + # return_type, size, data, masks, offsets, extra args + sig = void(*(sig + offsets + [typeof(arg) for arg in args])) return sig @@ -105,12 +201,23 @@ def mask_get(mask, pos): kernel_template = """\ -def _kernel(retval, {input_columns}, {input_offsets}, size): +def _kernel(retval, size, {input_columns}, {input_offsets}, {extra_args}): i = cuda.grid(1) ret_data_arr, ret_mask_arr = retval if i < size: + # Create a structured array with the desired fields + rows = cuda.local.array(1, dtype=row_type) + + # one element of that array + row = rows[0] + {masked_input_initializers} - ret = {user_udf_call} +{row_initializers} + + # pass the assembled row into the udf + ret = f_(row, {extra_args}) + + # pack up the return values and set them ret_masked = pack_return(ret) ret_data_arr[i] = ret_masked.value ret_mask_arr[i] = ret_masked.valid @@ -126,19 +233,55 @@ def _kernel(retval, {input_columns}, {input_offsets}, size): masked_{idx} = Masked(d_{idx}[i], mask_get(m_{idx}, i + offset_{idx})) """ +row_initializer_template = """\ + row["{name}"] = masked_{idx} +""" + -def _define_function(df, scalar_return=False): +def _define_function(frame, row_type, args): + """ + The kernel we want to JIT compile looks something like the following, + which is an example for two columns that both have nulls present + + def _kernel(retval, input_col_0, input_col_1, offset_0, offset_1, size): + i = cuda.grid(1) + ret_data_arr, ret_mask_arr = retval + if i < size: + rows = cuda.local.array(1, dtype=row_type) + row = rows[0] + + d_0, m_0 = input_col_0 + masked_0 = Masked(d_0[i], mask_get(m_0, i + offset_0)) + d_1, m_1 = input_col_1 + masked_1 = Masked(d_1[i], mask_get(m_1, i + offset_1)) + + row["a"] = masked_0 + row["b"] = masked_1 + + ret = f_(row) + + ret_masked = pack_return(ret) + ret_data_arr[i] = ret_masked.value + ret_mask_arr[i] = ret_masked.valid + + However we do not always have two columns and columns do not always have + an associated mask. Ideally, we would just write one kernel and make use + of `*args` - and then one function would work for any number of columns, + currently numba does not support `*args` and treats functions it JITs as + if `*args` is a singular argument. Thus we are forced to write the right + functions dynamically at runtime and define them using `exec`. 
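The exec-based code generation described above boils down to a small pattern: render a source string for the required number of columns, exec it into a scratch namespace, and pull the resulting function back out. The template and names below are made up for illustration (the real one is `kernel_template`):

    template = "def _kernel({args}):\n    return ({args},)\n"

    def make_kernel(ncols):
        # Render a per-column argument list, then define the function
        # dynamically, much as _define_function and compile_or_get do
        # with the far larger kernel_template.
        args = ", ".join(f"col_{i}" for i in range(ncols))
        namespace = {}
        exec(template.format(args=args), {}, namespace)
        return namespace["_kernel"]

    make_kernel(3)(1, 2, 3)  # -> (1, 2, 3)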
+ """ # Create argument list for kernel - input_columns = ", ".join([f"input_col_{i}" for i in range(len(df._data))]) - input_offsets = ", ".join([f"offset_{i}" for i in range(len(df._data))]) + frame = supported_cols_from_frame(frame) - # Create argument list to pass to device function - args = ", ".join([f"masked_{i}" for i in range(len(df._data))]) - user_udf_call = f"f_({args})" + input_columns = ", ".join([f"input_col_{i}" for i in range(len(frame))]) + input_offsets = ", ".join([f"offset_{i}" for i in range(len(frame))]) + extra_args = ", ".join([f"extra_arg_{i}" for i in range(len(args))]) # Generate the initializers for each device function argument initializers = [] - for i, col in enumerate(df._data.values()): + row_initializers = [] + for i, (colname, col) in enumerate(frame.items()): idx = str(i) if col.mask is not None: template = masked_input_initializer_template @@ -149,21 +292,26 @@ def _define_function(df, scalar_return=False): initializers.append(initializer) - masked_input_initializers = "\n".join(initializers) + row_initializer = row_initializer_template.format( + idx=idx, name=colname + ) + row_initializers.append(row_initializer) # Incorporate all of the above into the kernel code template d = { "input_columns": input_columns, "input_offsets": input_offsets, - "masked_input_initializers": masked_input_initializers, - "user_udf_call": user_udf_call, + "extra_args": extra_args, + "masked_input_initializers": "\n".join(initializers), + "row_initializers": "\n".join(row_initializers), + "numba_rectype": row_type, # from global } return kernel_template.format(**d) @annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python") -def compile_or_get(df, f): +def compile_or_get(frame, func, args): """ Return a compiled kernel in terms of MaskedTypes that launches a kernel equivalent of `f` for the dtypes of `df`. The kernel uses @@ -173,30 +321,49 @@ def compile_or_get(df, f): If the UDF has already been compiled for this requested dtypes, a cached version will be returned instead of running compilation. + CUDA kernels are void and do not return values. Thus, we need to + preallocate a column of the correct dtype and pass it in as one of + the kernel arguments. This creates a chicken-and-egg problem where + we need the column type to compile the kernel, but normally we would + be getting that type FROM compiling the kernel (and letting numba + determine it as a return value). As a workaround, we compile the UDF + itself outside the final kernel to invoke a full typing pass, which + unfortunately is difficult to do without running full compilation. + we then obtain the return type from that separate compilation and + use it to allocate an output column of the right dtype. 
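The "kernels are void" constraint described above is a property of numba's CUDA target rather than anything cudf-specific. A minimal sketch of the pattern (illustrative names only; running it requires a CUDA-capable GPU): the output array has to be allocated with a known dtype before launch, which is exactly why the UDF's return type must be discovered up front.

    import numpy as np
    from numba import cuda

    @cuda.jit
    def double_kernel(out, inp):
        # Kernels cannot return values; results are written into `out`.
        i = cuda.grid(1)
        if i < inp.shape[0]:
            out[i] = 2 * inp[i]

    inp = np.arange(10, dtype=np.float64)
    out = np.empty_like(inp)          # dtype chosen before launching
    double_kernel.forall(inp.size)(out, inp)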
""" # check to see if we already compiled this function - frame_dtypes = tuple(col.dtype for col in df._data.values()) - cache_key = ( - *cudautils.make_cache_key(f, frame_dtypes), - *(col.mask is None for col in df._data.values()), - ) + cache_key = generate_cache_key(frame, func) if precompiled.get(cache_key) is not None: - kernel, scalar_return_type = precompiled[cache_key] - return kernel, scalar_return_type - - numba_return_type = get_udf_return_type(f, frame_dtypes) - - _is_scalar_return = not isinstance(numba_return_type, MaskedType) - scalar_return_type = ( - numba_return_type - if _is_scalar_return - else numba_return_type.value_type - ) - - sig = construct_signature(df, scalar_return_type) - f_ = cuda.jit(device=True)(f) - + kernel, masked_or_scalar = precompiled[cache_key] + return kernel, masked_or_scalar + + # precompile the user udf to get the right return type. + # could be a MaskedType or a scalar type. + scalar_return_type = get_udf_return_type(frame, func, args) + + # get_udf_return_type will throw a TypingError if the user tries to use + # a field in the row containing an unsupported dtype, except in the + # edge case where all the function does is return that element: + + # def f(row): + # return row[] + # In this case numba is happy to return MaskedType() + # because it relies on not finding overloaded operators for types to raise + # the exception, so we have to explicitly check for that case. + if isinstance(scalar_return_type, Poison): + raise TypeError(str(scalar_return_type)) + + # this is the signature for the final full kernel compilation + sig = construct_signature(frame, scalar_return_type, args) + + # this row type is used within the kernel to pack up the column and + # mask data into the dict like data structure the user udf expects + np_field_types = np.dtype(list(supported_dtypes_from_frame(frame).items())) + row_type = get_frame_row_type(np_field_types) + + f_ = cuda.jit(device=True)(func) # Dict of 'local' variables into which `_kernel` is defined local_exec_context = {} global_exec_context = { @@ -205,16 +372,17 @@ def compile_or_get(df, f): "Masked": Masked, "mask_get": mask_get, "pack_return": pack_return, + "row_type": row_type, } exec( - _define_function(df, scalar_return=_is_scalar_return), + _define_function(frame, row_type, args), global_exec_context, local_exec_context, ) # The python function definition representing the kernel _kernel = local_exec_context["_kernel"] kernel = cuda.jit(sig)(_kernel) - scalar_return_type = numpy_support.as_dtype(scalar_return_type) - precompiled[cache_key] = (kernel, scalar_return_type) + np_return_type = numpy_support.as_dtype(scalar_return_type) + precompiled[cache_key] = (kernel, np_return_type) - return kernel, scalar_return_type + return kernel, np_return_type diff --git a/python/cudf/cudf/core/udf/typing.py b/python/cudf/cudf/core/udf/typing.py index 042d97db838..4b0f0bf1283 100644 --- a/python/cudf/cudf/core/udf/typing.py +++ b/python/cudf/cudf/core/udf/typing.py @@ -18,7 +18,19 @@ from pandas._libs.missing import NAType as _NAType from cudf.core.udf import api -from cudf.core.udf._ops import arith_ops, comparison_ops +from cudf.core.udf._ops import ( + arith_ops, + bitwise_ops, + comparison_ops, + unary_ops, +) + +SUPPORTED_NUMBA_TYPES = ( + types.Number, + types.Boolean, + types.NPDatetime, + types.NPTimedelta, +) class MaskedType(types.Type): @@ -30,9 +42,18 @@ class MaskedType(types.Type): def __init__(self, value): # MaskedType in Numba shall be parameterized # with a value type - if not 
isinstance(value, (types.Number, types.Boolean)): - raise TypeError("value_type must be a numeric scalar type") - self.value_type = value + if isinstance(value, SUPPORTED_NUMBA_TYPES): + self.value_type = value + else: + # Unsupported Dtype. Numba tends to print out the type info + # for whatever operands and operation failed to type and then + # output its own error message. Putting the message in the repr + # then is one way of getting the true cause to the user + self.value_type = types.Poison( + "\n\n\n Unsupported MaskedType. This is usually caused by " + "attempting to use a column of unsupported dtype in a UDF. " + f"Supported dtypes are {SUPPORTED_NUMBA_TYPES}" + ) super().__init__(name=f"Masked{self.value_type}") def __hash__(self): @@ -111,10 +132,18 @@ def typeof_masked(val, c): @cuda_decl_registry.register class MaskedConstructor(ConcreteTemplate): key = api.Masked - + units = ["ns", "ms", "us", "s"] + datetime_cases = set(types.NPDatetime(u) for u in units) + timedelta_cases = set(types.NPTimedelta(u) for u in units) cases = [ nb_signature(MaskedType(t), t, types.boolean) - for t in (types.integer_domain | types.real_domain) + for t in ( + types.integer_domain + | types.real_domain + | datetime_cases + | timedelta_cases + | {types.boolean} + ) ] @@ -224,6 +253,15 @@ def generic(self, args, kws): return nb_signature(MaskedType(return_type), args[0], args[1]) +class MaskedScalarUnaryOp(AbstractTemplate): + def generic(self, args, kws): + if len(args) == 1 and isinstance(args[0], MaskedType): + return_type = self.context.resolve_function_type( + self.key, (args[0].value_type,), kws + ).return_type + return nb_signature(MaskedType(return_type), args[0]) + + class MaskedScalarNullOp(AbstractTemplate): def generic(self, args, kws): """ @@ -246,14 +284,18 @@ def generic(self, args, kws): """ # In the case of op(Masked, scalar), we resolve the type between # the Masked value_type and the scalar's type directly + to_resolve_types = None if isinstance(args[0], MaskedType) and isinstance( - args[1], (types.Number, types.Boolean) + args[1], SUPPORTED_NUMBA_TYPES ): to_resolve_types = (args[0].value_type, args[1]) - elif isinstance(args[0], (types.Number, types.Boolean)) and isinstance( + elif isinstance(args[0], SUPPORTED_NUMBA_TYPES) and isinstance( args[1], MaskedType ): to_resolve_types = (args[1].value_type, args[0]) + else: + # fail typing + return None return_type = self.context.resolve_function_type( self.key, to_resolve_types, kws ).return_type @@ -298,14 +340,17 @@ def generic(self, args, kws): if isinstance(args[0], MaskedType): # MaskedType(dtype, valid) -> MaskedType(dtype, valid) return nb_signature(args[0], args[0]) - elif isinstance(args[0], (types.Number, types.Boolean)): + elif isinstance(args[0], SUPPORTED_NUMBA_TYPES): # scalar_type -> MaskedType(scalar_type, True) return_type = MaskedType(args[0]) return nb_signature(return_type, args[0]) -for op in arith_ops + comparison_ops: +for binary_op in arith_ops + bitwise_ops + comparison_ops: # Every op shares the same typing class - cuda_decl_registry.register_global(op)(MaskedScalarArithOp) - cuda_decl_registry.register_global(op)(MaskedScalarNullOp) - cuda_decl_registry.register_global(op)(MaskedScalarScalarOp) + cuda_decl_registry.register_global(binary_op)(MaskedScalarArithOp) + cuda_decl_registry.register_global(binary_op)(MaskedScalarNullOp) + cuda_decl_registry.register_global(binary_op)(MaskedScalarScalarOp) + +for unary_op in unary_ops: + cuda_decl_registry.register_global(unary_op)(MaskedScalarUnaryOp) diff --git 
a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index a3d96cee051..680cfca19eb 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -288,6 +288,10 @@ def apply(self, func, *args, **kwargs): ---------- func : function A user defined function that takes an 1D array as input + args : tuple + unsupported. + kwargs + unsupported See also -------- @@ -298,6 +302,27 @@ def apply(self, func, *args, **kwargs): ----- See notes of the :meth:`cudf.Series.applymap` + Example + ------- + + >>> import cudf + >>> def count_if_gt_3(window): + ... count = 0 + ... for i in window: + ... if i > 3: + ... count += 1 + ... return count + ... + >>> s = cudf.Series([0, 1.1, 5.8, 3.1, 6.2, 2.0, 1.5]) + >>> s.rolling(3, min_periods=1).apply(count_if_gt_3) + 0 0 + 1 0 + 2 1 + 3 2 + 4 3 + 5 2 + 6 1 + dtype: int64 """ has_nulls = False if isinstance(self.obj, cudf.Series): diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py index b568c108191..2341a5c23b9 100644 --- a/python/cudf/cudf/datasets.py +++ b/python/cudf/cudf/datasets.py @@ -18,7 +18,7 @@ def timeseries( nulls_frequency=0, seed=None, ): - """ Create timeseries dataframe with random data + """Create timeseries dataframe with random data Parameters ---------- @@ -81,7 +81,7 @@ def timeseries( def randomdata(nrows=10, dtypes=None, seed=None): - """ Create a dataframe with random data + """Create a dataframe with random data Parameters ---------- diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 565a109eb79..01f1fdf9020 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -46,6 +46,7 @@ def read_csv( na_filter=True, prefix=None, index_col=None, + use_python_file_object=True, **kwargs, ): """{docstring}""" @@ -58,21 +59,21 @@ def read_csv( "`read_csv` does not yet support reading multiple files" ) + # Only need to pass byte_ranges to get_filepath_or_buffer + # if `use_python_file_object=False` + byte_ranges = None + if not use_python_file_object and byte_range: + byte_ranges = [byte_range] + filepath_or_buffer, compression = ioutils.get_filepath_or_buffer( path_or_data=filepath_or_buffer, compression=compression, iotypes=(BytesIO, StringIO, NativeFile), - byte_ranges=[byte_range] if byte_range else None, - clip_local_buffer=True if byte_range else False, + byte_ranges=byte_ranges, + use_python_file_object=use_python_file_object, **kwargs, ) - # Adjust byte_range for clipped local buffers - use_byte_range = byte_range - if byte_range and isinstance(filepath_or_buffer, BytesIO): - if byte_range[1] == filepath_or_buffer.getbuffer().nbytes: - use_byte_range = (0, byte_range[1]) - if na_values is not None and is_scalar(na_values): na_values = [na_values] @@ -100,7 +101,7 @@ def read_csv( true_values=true_values, false_values=false_values, nrows=nrows, - byte_range=use_byte_range, + byte_range=byte_range, skip_blank_lines=skip_blank_lines, parse_dates=parse_dates, comment=comment, diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 9d97bee0396..00a2cb4cee2 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,10 +1,9 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2021, NVIDIA CORPORATION. 
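The rewritten `to_dlpack` in the dlpack.py changes that follow rejects non-numeric columns and casts the remaining columns to a single common dtype before exporting. A hedged usage sketch with made-up data, mirroring the `test_to_dlpack_mixed_dtypes` test added later in this diff:

    import cudf
    import cupy

    df = cudf.DataFrame({"a": [1, 2, 3, 4], "b": [10.5, 0.4, -0.2, -1000.3]})
    capsule = df.to_dlpack()        # int64 + float64 -> common float64
    arr = cupy.fromDlpack(capsule)  # consume the capsule on the GPU side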
+ +import cudf from cudf._lib import interop as libdlpack from cudf.core.column import ColumnBase -from cudf.core.dataframe import DataFrame -from cudf.core.index import BaseIndex -from cudf.core.series import Series from cudf.utils import ioutils @@ -38,9 +37,9 @@ def from_dlpack(pycapsule_obj): data, _ = libdlpack.from_dlpack(pycapsule_obj) if len(data) == 1: - return Series._from_data(data) + return cudf.Series._from_data(data) else: - return DataFrame._from_data(data) + return cudf.DataFrame._from_data(data) @ioutils.doc_to_dlpack() @@ -71,14 +70,25 @@ def to_dlpack(cudf_obj): if len(cudf_obj) == 0: raise ValueError("Cannot create DLPack tensor of 0 size") - if isinstance(cudf_obj, (DataFrame, Series, BaseIndex)): - gdf_cols = cudf_obj + if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)): + gdf = cudf_obj elif isinstance(cudf_obj, ColumnBase): - gdf_cols = cudf_obj.as_frame() + gdf = cudf_obj.as_frame() else: raise TypeError( f"Input of type {type(cudf_obj)} cannot be converted " "to DLPack tensor" ) - return libdlpack.to_dlpack(gdf_cols) + if any( + not cudf.api.types._is_non_decimal_numeric_dtype(col.dtype) + for col in gdf._data.columns + ): + raise TypeError("non-numeric data not yet supported") + + dtype = cudf.utils.dtypes.find_common_type( + [col.dtype for col in gdf._data.columns] + ) + gdf = gdf.astype(dtype) + + return libdlpack.to_dlpack(gdf) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 3aa672223c9..ecb1b0cd185 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -287,6 +287,7 @@ def read_orc( use_index=True, decimal_cols_as_float=None, timestamp_type=None, + use_python_file_object=True, **kwargs, ): """{docstring}""" @@ -321,7 +322,10 @@ def read_orc( source = fs.sep.join([source, "*.orc"]) tmp_source, compression = ioutils.get_filepath_or_buffer( - path_or_data=source, compression=None, **kwargs, + path_or_data=source, + compression=None, + use_python_file_object=use_python_file_object, + **kwargs, ) if compression is not None: raise ValueError( @@ -387,7 +391,16 @@ def read_orc_stripe(orc_file, stripe, columns): @ioutils.doc_to_orc() -def to_orc(df, fname, compression=None, enable_statistics=True, **kwargs): +def to_orc( + df, + fname, + compression=None, + enable_statistics=True, + stripe_size_bytes=None, + stripe_size_rows=None, + row_index_stride=None, + **kwargs, +): """{docstring}""" for col in df._data.columns: @@ -414,9 +427,25 @@ def to_orc(df, fname, compression=None, enable_statistics=True, **kwargs): if ioutils.is_fsspec_open_file(path_or_buf): with path_or_buf as file_obj: file_obj = ioutils.get_IOBase_writer(file_obj) - liborc.write_orc(df, file_obj, compression, enable_statistics) + liborc.write_orc( + df, + file_obj, + compression, + enable_statistics, + stripe_size_bytes, + stripe_size_rows, + row_index_stride, + ) else: - liborc.write_orc(df, path_or_buf, compression, enable_statistics) + liborc.write_orc( + df, + path_or_buf, + compression, + enable_statistics, + stripe_size_bytes, + stripe_size_rows, + row_index_stride, + ) ORCWriter = liborc.ORCWriter diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index b101835e626..cc5aec36853 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -68,7 +68,7 @@ def count_zero(arr): def assert_eq(left, right, **kwargs): - """ Assert that two cudf-like things are equivalent + """Assert that two cudf-like things are equivalent This equality test works for pandas/cudf 
dataframes/series/indexes/scalars in the same way, and so makes it easier to perform parametrized testing diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index afe21201b7e..f4a80c60ddf 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -368,6 +368,22 @@ def rand_dataframe( dtype=dtype, ) ) + elif dtype == "decimal32": + max_precision = meta.get( + "max_precision", cudf.Decimal32Dtype.MAX_PRECISION + ) + precision = np.random.randint(1, max_precision) + scale = np.random.randint(0, precision) + dtype = cudf.Decimal32Dtype(precision=precision, scale=scale) + column_params.append( + ColumnParameters( + cardinality=cardinality, + null_frequency=null_frequency, + generator=decimal_generator(dtype=dtype, size=cardinality), + is_sorted=False, + dtype=dtype, + ) + ) elif dtype == "category": column_params.append( ColumnParameters( diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.int16.rle.size.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.int16.rle.size.orc new file mode 100644 index 00000000000..2e96e40cda7 Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.int16.rle.size.orc differ diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 50fd27f2752..542dcd9301c 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -668,6 +668,8 @@ def test_boolean_scalar_binop(op): "rmod", "pow", "rpow", + "div", + "divide", "floordiv", "rfloordiv", "truediv", @@ -833,6 +835,42 @@ def gen_df(): utils.assert_eq(expect, got) +@pytest.mark.parametrize("func", _operators_comparison) +@pytest.mark.parametrize("nulls", _nulls) +@pytest.mark.parametrize("other", ["df", "scalar"]) +def test_logical_operator_func_dataframe(func, nulls, other): + np.random.seed(0) + num_rows = 100 + num_cols = 3 + + def gen_df(): + pdf = pd.DataFrame() + from string import ascii_lowercase + + cols = np.random.choice(num_cols + 5, num_cols, replace=False) + + for i in range(num_cols): + colname = ascii_lowercase[cols[i]] + data = utils.gen_rand("float64", num_rows) * 10000 + if nulls == "some": + idx = np.random.choice( + num_rows, size=int(num_rows / 2), replace=False + ) + data[idx] = np.nan + pdf[colname] = data + return pdf + + pdf1 = gen_df() + pdf2 = gen_df() if other == "df" else 59.0 + gdf1 = cudf.DataFrame.from_pandas(pdf1) + gdf2 = cudf.DataFrame.from_pandas(pdf2) if other == "df" else 59.0 + + got = getattr(gdf1, func)(gdf2) + expect = getattr(pdf1, func)(pdf2)[list(got._data)] + + utils.assert_eq(expect, got) + + @pytest.mark.parametrize("func", _operators_arithmetic + _operators_comparison) @pytest.mark.parametrize("rhs", [0, 1, 2, 128]) def test_binop_bool_uint(func, rhs): @@ -2875,7 +2913,7 @@ def set_null_cases(column_l, column_r, case): "lcol,rcol,ans,case", generate_test_null_equals_columnops_data() ) def test_null_equals_columnops(lcol, rcol, ans, case): - assert lcol._null_equals(rcol).all() == ans + assert lcol.equals(rcol).all() == ans def test_add_series_to_dataframe(): diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 3c1ff4c968e..bb96f3c4290 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -229,7 +229,11 @@ def test_concat_multiindex_dataframe(): pd.concat([pdg1, pdg2]), check_index_type=True, ) - assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1)) + assert_eq( 
+ gd.concat([gdg1, gdg2], axis=1), + pd.concat([pdg1, pdg2], axis=1), + check_index_type=True, + ) def test_concat_multiindex_series(): @@ -269,7 +273,11 @@ def test_concat_multiindex_dataframe_and_series(): pdg2.name = "a" gdg1 = gd.from_pandas(pdg1) gdg2 = gd.from_pandas(pdg2) - assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1)) + assert_eq( + gd.concat([gdg1, gdg2], axis=1), + pd.concat([pdg1, pdg2], axis=1), + check_index_type=True, + ) def test_concat_multiindex_series_and_dataframe(): @@ -288,7 +296,11 @@ def test_concat_multiindex_series_and_dataframe(): pdg1.name = "a" gdg1 = gd.from_pandas(pdg1) gdg2 = gd.from_pandas(pdg2) - assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1)) + assert_eq( + gd.concat([gdg1, gdg2], axis=1), + pd.concat([pdg1, pdg2], axis=1), + check_index_type=True, + ) @pytest.mark.parametrize("myindex", ["a", "b"]) @@ -328,7 +340,9 @@ def test_pandas_concat_compatibility_axis1(): expect = pd.concat([pd1, pd2, pd3, pd4, pd5], axis=1) got = gd.concat([d1, d2, d3, d4, d5], axis=1) - assert_eq(got, expect) + assert_eq( + got, expect, check_index_type=True, + ) @pytest.mark.parametrize("index", [[0, 1, 2], [2, 1, 0], [5, 9, 10]]) @@ -350,7 +364,7 @@ def test_pandas_concat_compatibility_axis1_overlap(index, names, data): ps2 = s2.to_pandas() got = gd.concat([s1, s2], axis=1) expect = pd.concat([ps1, ps2], axis=1) - assert_eq(got, expect) + assert_eq(got, expect, check_index_type=True) def test_pandas_concat_compatibility_axis1_eq_index(): @@ -640,10 +654,12 @@ def test_concat_dataframe_with_multiIndex(df1, df2): pdf1 = gdf1.to_pandas() pdf2 = gdf2.to_pandas() - expected = gd.concat([gdf1, gdf2], axis=1) - actual = pd.concat([pdf1, pdf2], axis=1) + actual = gd.concat([gdf1, gdf2], axis=1) + expected = pd.concat([pdf1, pdf2], axis=1) - assert_eq(expected, actual) + assert_eq( + expected, actual, check_index_type=True, + ) @pytest.mark.parametrize( @@ -761,18 +777,23 @@ def test_concat_join_axis_1_dup_error(objs): def test_concat_join_axis_1(objs, ignore_index, sort, join, axis): # no duplicate columns gpu_objs = [gd.from_pandas(o) for o in objs] - + expected = pd.concat( + objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis + ) + actual = gd.concat( + gpu_objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis, + ) + # TODO: Remove special handling below + # after following bug from pandas is fixed: + # https://github.com/pandas-dev/pandas/issues/43584 assert_eq( - pd.concat( - objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis - ), - gd.concat( - gpu_objs, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), + expected, + actual, + check_index_type=False + if sort + and isinstance(expected.index, pd.Int64Index) + and isinstance(actual.index, gd.RangeIndex) + else True, ) @@ -833,14 +854,23 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): ) gdf1 = gd.from_pandas(pdf1) - + expected = pd.concat( + [pdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis + ) + actual = gd.concat( + [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis + ) + # TODO: Remove special handling below + # after following bug from pandas is fixed: + # https://github.com/pandas-dev/pandas/issues/43584 assert_eq( - pd.concat( - [pdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis - ), - gd.concat( - [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis - ), + expected, + actual, + check_index_type=False + if sort + 
and isinstance(expected.index, pd.Int64Index) + and isinstance(actual.index, gd.RangeIndex) + else True, ) @@ -870,21 +900,34 @@ def test_concat_join_no_overlapping_columns( ): gdf1 = gd.from_pandas(pdf1) gdf2 = gd.from_pandas(pdf2) + + expected = pd.concat( + [pdf1, pdf2], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = gd.concat( + [gdf1, gdf2], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + + # TODO: Remove special handling below + # after following bug from pandas is fixed: + # https://github.com/pandas-dev/pandas/issues/43584 assert_eq( - pd.concat( - [pdf1, pdf2], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - gd.concat( - [gdf1, gdf2], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), + expected, + actual, + check_index_type=False + if sort + and axis == 1 + and isinstance(expected.index, pd.Int64Index) + and isinstance(actual.index, gd.RangeIndex) + else True, ) @@ -1013,22 +1056,24 @@ def test_concat_join_no_overlapping_columns_empty_df_basic( gdf6 = gd.from_pandas(pdf6) gdf_empty = gd.from_pandas(pdf_empty) - assert_eq( - pd.concat( - [pdf6, pdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ).reset_index(drop=True), - gd.concat( - [gdf6, gdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), + expected = pd.concat( + [pdf6, pdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = gd.concat( + [gdf6, gdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, ) + # TODO: change `check_index_type` to `True` + # after following bug from pandas is fixed: + # https://github.com/pandas-dev/pandas/issues/43584 + assert_eq(expected, actual, check_index_type=False) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -1046,21 +1091,32 @@ def test_concat_join_series(ignore_index, sort, join, axis): ps3 = s3.to_pandas() ps4 = s4.to_pandas() + expected = pd.concat( + [ps1, ps2, ps3, ps4], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = gd.concat( + [s1, s2, s3, s4], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + + # TODO: Remove special handling below + # after following bug from pandas is fixed: + # https://github.com/pandas-dev/pandas/issues/43584 assert_eq( - gd.concat( - [s1, s2, s3, s4], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - pd.concat( - [ps1, ps2, ps3, ps4], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), + expected, + actual, + check_index_type=False + if sort + and isinstance(expected.index, pd.Int64Index) + and isinstance(actual.index, gd.RangeIndex) + else True, ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 5a839507182..2f329766936 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1103,27 +1103,52 @@ def test_assign(): @pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) -def test_dataframe_hash_columns(nrows): +@pytest.mark.parametrize("method", ["murmur3", "md5"]) +def test_dataframe_hash_columns(nrows, method): gdf = cudf.DataFrame() data = np.asarray(range(nrows)) data[0] = data[-1] # make first and last the same gdf["a"] = data gdf["b"] = gdf.a + 100 - out = gdf.hash_columns(["a", "b"]) - assert isinstance(out, cupy.ndarray) + with pytest.warns(FutureWarning): + out = 
gdf.hash_columns(["a", "b"]) + assert isinstance(out, cudf.Series) assert len(out) == nrows assert out.dtype == np.int32 # Check default - out_all = gdf.hash_columns() - np.testing.assert_array_equal(cupy.asnumpy(out), cupy.asnumpy(out_all)) + with pytest.warns(FutureWarning): + out_all = gdf.hash_columns() + assert_eq(out, out_all) # Check single column - out_one = cupy.asnumpy(gdf.hash_columns(["a"])) + with pytest.warns(FutureWarning): + out_one = gdf.hash_columns(["a"], method=method) # First matches last - assert out_one[0] == out_one[-1] + assert out_one.iloc[0] == out_one.iloc[-1] # Equivalent to the cudf.Series.hash_values() - np.testing.assert_array_equal(cupy.asnumpy(gdf.a.hash_values()), out_one) + assert_eq(gdf["a"].hash_values(method=method), out_one) + + +@pytest.mark.parametrize("nrows", [1, 8, 100, 1000]) +@pytest.mark.parametrize("method", ["murmur3", "md5"]) +def test_dataframe_hash_values(nrows, method): + gdf = cudf.DataFrame() + data = np.asarray(range(nrows)) + data[0] = data[-1] # make first and last the same + gdf["a"] = data + gdf["b"] = gdf.a + 100 + out = gdf.hash_values() + assert isinstance(out, cudf.Series) + assert len(out) == nrows + assert out.dtype == np.int32 + + # Check single column + out_one = gdf[["a"]].hash_values(method=method) + # First matches last + assert out_one.iloc[0] == out_one.iloc[-1] + # Equivalent to the cudf.Series.hash_values() + assert_eq(gdf["a"].hash_values(method=method), out_one) @pytest.mark.parametrize("nrows", [3, 10, 100, 1000]) @@ -2067,6 +2092,24 @@ def test_unaryops_df(pdf, gdf, unaryop): assert_eq(d, g) +@pytest.mark.parametrize("unary_func", ["abs", "floor", "ceil"]) +def test_unary_func_df(pdf, unary_func): + np.random.seed(0) + disturbance = pd.Series(np.random.rand(10)) + pdf = pdf - 5 + disturbance + d = pdf.apply(getattr(np, unary_func)) + g = getattr(cudf.from_pandas(pdf), unary_func)() + assert_eq(d, g) + + +def test_scale_df(gdf): + got = (gdf - 5).scale() + expect = cudf.DataFrame( + {"x": np.linspace(0.0, 1.0, 10), "y": np.linspace(0.0, 1.0, 10)} + ) + assert_eq(expect, got) + + @pytest.mark.parametrize( "func", [ @@ -2199,16 +2242,34 @@ def test_series_hash_encode(nrows): s = cudf.Series(data, name=1) num_features = 1000 - encoded_series = s.hash_encode(num_features) + with pytest.warns(FutureWarning): + encoded_series = s.hash_encode(num_features) assert isinstance(encoded_series, cudf.Series) enc_arr = encoded_series.to_numpy() assert np.all(enc_arr >= 0) assert np.max(enc_arr) < num_features - enc_with_name_arr = s.hash_encode(num_features, use_name=True).to_numpy() + with pytest.warns(FutureWarning): + enc_with_name_arr = s.hash_encode( + num_features, use_name=True + ).to_numpy() assert enc_with_name_arr[0] != enc_arr[0] +def test_series_hash_encode_reproducible_results(): + # Regression test to ensure that hash_encode outputs are reproducible + data = cudf.Series([0, 1, 2]) + with pytest.warns(FutureWarning): + hash_result = data.hash_encode(stop=2 ** 16, use_name=False) + expected_result = cudf.Series([42165, 55037, 7341]) + assert_eq(hash_result, expected_result) + + with pytest.warns(FutureWarning): + hash_result_with_name = data.hash_encode(stop=2 ** 16, use_name=True) + expected_result_with_name = cudf.Series([36137, 39649, 58673]) + assert_eq(hash_result_with_name, expected_result_with_name) + + @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) def test_cuda_array_interface(dtype): @@ -7918,7 +7979,9 @@ def test_dataframe_mode(df, numeric_only, dropna): assert_eq(expected, actual, 
check_dtype=False) -@pytest.mark.parametrize("lhs, rhs", [("a", "a"), ("a", "b"), (1, 1.0)]) +@pytest.mark.parametrize( + "lhs, rhs", [("a", "a"), ("a", "b"), (1, 1.0), (None, None), (None, "a")] +) def test_equals_names(lhs, rhs): lhs = cudf.DataFrame({lhs: [1, 2]}) rhs = cudf.DataFrame({rhs: [1, 2]}) @@ -8671,12 +8734,12 @@ def test_explode(data, labels, ignore_index, p_index, label_to_explode): ( cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), True, - cudf.Series([1, 2, 0], dtype="int32"), + cupy.array([1, 2, 0], dtype="int32"), ), ( cudf.DataFrame({"a": [10, 0, 2], "b": [-10, 10, 1]}), False, - cudf.Series([0, 2, 1], dtype="int32"), + cupy.array([0, 2, 1], dtype="int32"), ), ], ) diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 3bbac217283..deb10855802 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -1433,6 +1433,141 @@ def test_is_month_start(data, dtype): assert_eq(expect, got) +################################################################## +# Date Range Tests # +################################################################## + +date_range_test_dates_start = [ + "2000-02-13 08:41:06", # leap year + "1996-11-21 04:05:30", # non leap year + "1970-01-01 00:00:00", # unix epoch time 0 + "1831-05-08 15:23:21", +] +date_range_test_dates_end = [ + "2000-02-13 08:41:06", # leap year + "1996-11-21 04:05:30", # non leap year + "1970-01-01 00:00:00", # unix epoch time 0 + "1831-05-08 15:23:21", +] +date_range_test_periods = [1, 10, 100] +date_range_test_freq = [ + {"months": 3, "years": 1}, + {"hours": 10, "days": 57, "nanoseconds": 3}, + "83D", + "17h", + "-680T", + "110546s", + pytest.param( + "110546789L", + marks=pytest.mark.xfail( + True, + reason="Pandas DateOffset ignores milliseconds. 
" + "https://github.com/pandas-dev/pandas/issues/43371", + ), + ), + "110546789248U", +] + + +@pytest.fixture(params=date_range_test_dates_start[:]) +def start(request): + return request.param + + +@pytest.fixture(params=date_range_test_dates_end[:]) +def end(request): + return request.param + + +@pytest.fixture(params=date_range_test_periods[:]) +def periods(request): + return request.param + + +@pytest.fixture(params=date_range_test_freq[:]) +def freq(request): + return request.param + + +def test_date_range_start_end_periods(start, end, periods): + expect = pd.date_range(start=start, end=end, periods=periods, name="a") + got = cudf.date_range(start=start, end=end, periods=periods, name="a") + + np.testing.assert_allclose( + expect.to_numpy().astype("int64"), + got.to_pandas().to_numpy().astype("int64"), + ) + + +def test_date_range_start_end_freq(start, end, freq): + if isinstance(freq, str): + _gfreq = _pfreq = freq + else: + _gfreq = cudf.DateOffset(**freq) + _pfreq = pd.DateOffset(**freq) + + expect = pd.date_range(start=start, end=end, freq=_pfreq, name="a") + got = cudf.date_range(start=start, end=end, freq=_gfreq, name="a") + + np.testing.assert_allclose( + expect.to_numpy().astype("int64"), + got.to_pandas().to_numpy().astype("int64"), + ) + + +def test_date_range_start_freq_periods(start, freq, periods): + if isinstance(freq, str): + _gfreq = _pfreq = freq + else: + _gfreq = cudf.DateOffset(**freq) + _pfreq = pd.DateOffset(**freq) + + expect = pd.date_range(start=start, periods=periods, freq=_pfreq, name="a") + got = cudf.date_range(start=start, periods=periods, freq=_gfreq, name="a") + + np.testing.assert_allclose( + expect.to_numpy().astype("int64"), + got.to_pandas().to_numpy().astype("int64"), + ) + + +def test_date_range_end_freq_periods(end, freq, periods): + if isinstance(freq, str): + _gfreq = _pfreq = freq + else: + _gfreq = cudf.DateOffset(**freq) + _pfreq = pd.DateOffset(**freq) + + expect = pd.date_range(end=end, periods=periods, freq=_pfreq, name="a") + got = cudf.date_range(end=end, periods=periods, freq=_gfreq, name="a") + + np.testing.assert_allclose( + expect.to_numpy().astype("int64"), + got.to_pandas().to_numpy().astype("int64"), + ) + + +def test_date_range_raise_overflow(): + # Fixed offset + start = np.datetime64(np.iinfo("int64").max, "ns") + periods = 2 + freq = cudf.DateOffset(nanoseconds=1) + with pytest.raises(pd._libs.tslibs.np_datetime.OutOfBoundsDatetime): + cudf.date_range(start=start, periods=periods, freq=freq) + + # Non-fixed offset + start = np.datetime64(np.iinfo("int64").max, "ns") + periods = 2 + freq = cudf.DateOffset(months=1) + with pytest.raises(pd._libs.tslibs.np_datetime.OutOfBoundsDatetime): + cudf.date_range(start=start, periods=periods, freq=freq) + + +################################################################## +# End of Date Range Test # +################################################################## + + @pytest.mark.parametrize( "data", [ diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index 4b26e2c13bc..b686d4d3930 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -100,34 +100,12 @@ def test_to_dlpack_index(data_1d): assert str(type(dlt)) == "" -def test_to_dlpack_column(data_1d): - expectation = data_size_expectation_builder(data_1d) - - with expectation: - gs = cudf.Series(data_1d, nan_as_null=False) - dlt = gs._column.to_dlpack() - - # PyCapsules are a C-API thing so couldn't come up with a better way - assert 
str(type(dlt)) == "" - - -def test_to_dlpack_column_null(data_1d): - expectation = data_size_expectation_builder(data_1d, nan_null_param=True) - - with expectation: - gs = cudf.Series(data_1d, nan_as_null=True) - dlt = gs._column.to_dlpack() - - # PyCapsules are a C-API thing so couldn't come up with a better way - assert str(type(dlt)) == "" - - def test_to_dlpack_cupy_1d(data_1d): expectation = data_size_expectation_builder(data_1d, False) with expectation: gs = cudf.Series(data_1d, nan_as_null=False) cudf_host_array = gs.to_numpy(na_value=np.nan) - dlt = gs._column.to_dlpack() + dlt = gs.to_dlpack() cupy_array = cupy.fromDlpack(dlt) cupy_host_array = cupy_array.get() @@ -191,9 +169,21 @@ def test_to_dlpack_cupy_1d_null(data_1d): with expectation: gs = cudf.Series(data_1d) cudf_host_array = gs.to_numpy(na_value=np.nan) - dlt = gs._column.to_dlpack() + dlt = gs.to_dlpack() cupy_array = cupy.fromDlpack(dlt) cupy_host_array = cupy_array.get() assert_eq(cudf_host_array, cupy_host_array) + + +def test_to_dlpack_mixed_dtypes(): + df = cudf.DataFrame({"a": [1, 2, 3, 4], "b": [10.32, 0.4, -0.2, -1000.32]}) + + cudf_host_array = df.to_numpy() + dlt = df.to_dlpack() + + cupy_array = cupy.fromDlpack(dlt) + cupy_host_array = cupy_array.get() + + assert_eq(cudf_host_array, cupy_host_array) diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py index 03cd6c6f5cb..db53529b22f 100644 --- a/python/cudf/cudf/tests/test_gcs.py +++ b/python/cudf/cudf/tests/test_gcs.py @@ -29,7 +29,7 @@ def pdf(scope="module"): return df -def test_read_csv(pdf, monkeypatch): +def test_read_csv(pdf, monkeypatch, tmpdir): # Write to buffer fpath = TEST_BUCKET + "test_csv_reader.csv" buffer = pdf.to_csv(index=False) @@ -42,8 +42,21 @@ def mock_size(*args): monkeypatch.setattr(gcsfs.core.GCSFileSystem, "open", mock_open) monkeypatch.setattr(gcsfs.core.GCSFileSystem, "size", mock_size) - got = cudf.read_csv("gcs://{}".format(fpath)) + # Test read from explicit path. 
+ # Since we are monkey-patching, we cannot use + # use_python_file_object=True, because the pyarrow + # `open_input_file` command will fail (since it doesn't + # use the monkey-patched `open` definition) + got = cudf.read_csv("gcs://{}".format(fpath), use_python_file_object=False) + assert_eq(pdf, got) + + # AbstractBufferedFile -> PythonFile conversion + # will work fine with the monkey-patched FS if we + # pass in an fsspec file object + fs = gcsfs.core.GCSFileSystem() + with fs.open("gcs://{}".format(fpath)) as f: + got = cudf.read_csv(f) assert_eq(pdf, got) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 338e10ebe30..77d0b766e97 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1913,6 +1913,89 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value): ) +@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) +@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) +@pytest.mark.parametrize("direction", [1, -1]) +def test_groupby_diff_row(nelem, shift_perc, direction): + pdf = make_frame(pd.DataFrame, nelem=nelem, extra_vals=["val2"]) + gdf = cudf.from_pandas(pdf) + n_shift = int(nelem * shift_perc) * direction + + expected = pdf.groupby(["x", "y"]).diff(periods=n_shift) + got = gdf.groupby(["x", "y"]).diff(periods=n_shift) + + assert_groupby_results_equal( + expected[["val", "val2"]], got[["val", "val2"]] + ) + + +@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) +@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5]) +@pytest.mark.parametrize("direction", [1, -1]) +def test_groupby_diff_row_mixed_numerics(nelem, shift_perc, direction): + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "decimal64", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + ], + rows=nelem, + use_threads=False, + ) + pdf = t.to_pandas() + gdf = cudf.from_pandas(pdf) + n_shift = int(nelem * shift_perc) * direction + + expected = pdf.groupby(["0"]).diff(periods=n_shift) + got = gdf.groupby(["0"]).diff(periods=n_shift) + + assert_groupby_results_equal( + expected[["1", "2", "3", "4", "5"]], got[["1", "2", "3", "4", "5"]] + ) + + +@pytest.mark.parametrize("nelem", [10, 50, 100, 1000]) +def test_groupby_diff_row_zero_shift(nelem): + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10}, + { + "dtype": "datetime64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + { + "dtype": "timedelta64[ns]", + "null_frequency": 0.4, + "cardinality": 10, + }, + ], + rows=nelem, + use_threads=False, + ) + gdf = cudf.from_pandas(t.to_pandas()) + + expected = gdf + got = gdf.groupby(["0"]).shift(periods=0) + + assert_groupby_results_equal( + expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]] + ) + + # TODO: test for category columns when cudf.Scalar supports category type @pytest.mark.parametrize("nelem", [10, 100, 1000]) def test_groupby_fillna_multi_value(nelem): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 4ae86dc1cfc..c6cf7c4e6f5 100644 --- 
a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -43,7 +43,6 @@ def test_df_set_index_from_series(): df2 = df.set_index(df["b"]) assert list(df2.columns) == ["a", "b"] sliced_strided = df2.loc[2:6] - print(sliced_strided) assert len(sliced_strided) == 3 assert list(sliced_strided.index.values) == [2, 4, 6] @@ -55,11 +54,9 @@ def test_df_set_index_from_name(): # Check set_index(column_name) df2 = df.set_index("b") - print(df2) # 1 less column because 'b' is used as index assert list(df2.columns) == ["a"] sliced_strided = df2.loc[2:6] - print(sliced_strided) assert len(sliced_strided) == 3 assert list(sliced_strided.index.values) == [2, 4, 6] @@ -95,8 +92,7 @@ def test_index_find_label_range_genericindex(): def test_index_find_label_range_rangeindex(): - """Cudf specific - """ + """Cudf specific""" # step > 0 # 3, 8, 13, 18 ridx = RangeIndex(3, 20, 5) @@ -347,8 +343,7 @@ def test_index_copy_string(name, dtype, deep=True): NUMERIC_TYPES + ["datetime64[ns]", "timedelta64[ns]"] + OTHER_TYPES, ) def test_index_copy_integer(name, dtype, deep=True): - """Test for NumericIndex Copy Casts - """ + """Test for NumericIndex Copy Casts""" cidx = cudf.Int64Index([1, 2, 3]) pidx = cidx.to_pandas() @@ -361,8 +356,7 @@ def test_index_copy_integer(name, dtype, deep=True): @pytest.mark.parametrize("name", ["x"]) @pytest.mark.parametrize("dtype", SIGNED_TYPES) def test_index_copy_float(name, dtype, deep=True): - """Test for NumericIndex Copy Casts - """ + """Test for NumericIndex Copy Casts""" cidx = cudf.Float64Index([1.0, 2.0, 3.0]) pidx = cidx.to_pandas() @@ -2378,3 +2372,101 @@ def test_range_index_concat(objs): for obj in objs[1:]: expected = expected.append(obj) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "idx1, idx2", + [ + (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)), + (pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)), + (pd.RangeIndex(0, 10, 2), pd.RangeIndex(1, 5, 3)), + (pd.RangeIndex(1, 5, 3), pd.RangeIndex(0, 10, 2)), + (pd.RangeIndex(1, 10, 3), pd.RangeIndex(1, 5, 2)), + (pd.RangeIndex(1, 5, 2), pd.RangeIndex(1, 10, 3)), + (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 3)), + (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 6)), + (pd.RangeIndex(1, 100, 6), pd.RangeIndex(1, 50, 3)), + (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, name="b")), + (pd.Index([0, 1, 2, 30], name="a"), pd.Index([90, 100])), + (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), + (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])), + (pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "c", "z"])), + ], +) +@pytest.mark.parametrize("sort", [None, False]) +def test_union_index(idx1, idx2, sort): + expected = idx1.union(idx2, sort=sort) + + idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 + idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 + + actual = idx1.union(idx2, sort=sort) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "idx1, idx2", + [ + (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)), + (pd.RangeIndex(0, 10), pd.RangeIndex(-10, 20)), + (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, name="b")), + (pd.Index([0, 1, 2, 30], name="a"), pd.Index([30, 0, 90, 100])), + (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), + (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])), + (pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "c", "z"])), + ( + pd.Index(["a", "b", "c", "d", "c"]), + pd.Index(["a", "b", "c", "d", "c"]), + ), + (pd.Index([True, False, True, True]), pd.Index([10, 11, 12, 0, 1, 2])), + 
(pd.Index([True, False, True, True]), pd.Index([True, True])), + ], +) +@pytest.mark.parametrize("sort", [None, False]) +def test_intersection_index(idx1, idx2, sort): + + expected = idx1.intersection(idx2, sort=sort) + + idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 + idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 + + actual = idx1.intersection(idx2, sort=sort) + + assert_eq(expected, actual, exact=False) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3], + ["a", "v", "d"], + [234.243, 2432.3, None], + [True, False, True], + pd.Series(["a", " ", "v"], dtype="category"), + pd.IntervalIndex.from_breaks([0, 1, 2, 3]), + ], +) +@pytest.mark.parametrize( + "func", + [ + "is_numeric", + "is_boolean", + "is_integer", + "is_floating", + "is_object", + "is_categorical", + "is_interval", + ], +) +def test_index_type_methods(data, func): + pidx = pd.Index(data) + gidx = cudf.from_pandas(pidx) + + expected = getattr(pidx, func)() + actual = getattr(gidx, func)() + + if gidx.dtype == np.dtype("bool") and func == "is_object": + assert_eq(False, actual) + else: + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index e550c7c374e..90a20e2bab4 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -361,7 +361,10 @@ def test_dataframe_loc_duplicate_index_scalar(): pdf = pd.DataFrame({"a": [1, 2, 3, 4, 5]}, index=[1, 2, 1, 4, 2]) gdf = cudf.DataFrame.from_pandas(pdf) - assert_eq(pdf.loc[2], gdf.loc[2]) + pdf_sorted = pdf.sort_values(by=list(pdf.columns), axis=0) + gdf_sorted = gdf.sort_values(by=list(gdf.columns), axis=0) + + assert_eq(pdf_sorted, gdf_sorted) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index 775b866f5ce..88e822c27c4 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -178,8 +178,11 @@ def test_dataframe_join_suffix(): # Check assert list(expect.columns) == list(got.columns) assert_eq(expect.index.values, got.index.values) - for k in expect.columns: - _check_series(expect[k].fillna(-1), got[k].fillna(-1)) + + got_sorted = got.sort_values(by=list(got.columns), axis=0) + expect_sorted = expect.sort_values(by=list(expect.columns), axis=0) + for k in expect_sorted.columns: + _check_series(expect_sorted[k].fillna(-1), got_sorted[k].fillna(-1)) def test_dataframe_join_cats(): @@ -1356,7 +1359,9 @@ def test_categorical_typecast_inner(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"], check_categorical=False) + assert_join_results_equal( + expect_data, result["key"], how="inner", check_categorical=False + ) # Equal categories, unequal ordering -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) @@ -1374,7 +1379,9 @@ def test_categorical_typecast_inner(): expect_dtype = cudf.CategoricalDtype(categories=[2, 3], ordered=False) expect_data = cudf.Series([2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"], check_categorical=False) + assert_join_results_equal( + expect_data, result["key"], how="inner", check_categorical=False + ) # One is ordered -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) @@ -1404,7 +1411,7 @@ def test_categorical_typecast_left(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) 
expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"]) + assert_join_results_equal(expect_data, result["key"], how="left") # equal categories, unequal ordering -> error left = make_categorical_dataframe([1, 2, 3], ordered=True) @@ -1423,7 +1430,7 @@ def test_categorical_typecast_left(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"].sort_values().reset_index(drop=True)) + assert_join_results_equal(expect_data, result["key"], how="left") # unequal categories, unequal ordering -> error left = make_categorical_dataframe([1, 2, 3], ordered=True) @@ -1458,7 +1465,7 @@ def test_categorical_typecast_outer(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=False) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"]) + assert_join_results_equal(expect_data, result["key"], how="outer") # equal categories, both ordered -> common dtype left = make_categorical_dataframe([1, 2, 3], ordered=True) @@ -1468,7 +1475,7 @@ def test_categorical_typecast_outer(): expect_dtype = CategoricalDtype(categories=[1, 2, 3], ordered=True) expect_data = cudf.Series([1, 2, 3], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"]) + assert_join_results_equal(expect_data, result["key"], how="outer") # equal categories, one ordered -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) @@ -1487,7 +1494,7 @@ def test_categorical_typecast_outer(): expect_dtype = CategoricalDtype(categories=[1, 2, 3, 4], ordered=False) expect_data = cudf.Series([1, 2, 3, 4], dtype=expect_dtype, name="key") - assert_eq(expect_data, result["key"].sort_values().reset_index(drop=True)) + assert_join_results_equal(expect_data, result["key"], how="outer") # unequal categories, one ordered -> error left = make_categorical_dataframe([1, 2, 3], ordered=False) diff --git a/python/cudf/cudf/tests/test_label_encode.py b/python/cudf/cudf/tests/test_label_encode.py index 106179e2b47..e5c6bacf1d1 100644 --- a/python/cudf/cudf/tests/test_label_encode.py +++ b/python/cudf/cudf/tests/test_label_encode.py @@ -30,6 +30,8 @@ def _random(nelem, dtype): _param_dtypes = [np.int32, np.float32] +@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") +@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") @pytest.mark.parametrize( "nelem,dtype", list(product(_param_sizes, _param_dtypes)) ) @@ -57,6 +59,8 @@ def test_label_encode(nelem, dtype): assert df2.columns[1] == "cats_labels" +@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") +@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") def test_label_encode_drop_one(): random.seed(0) np.random.seed(0) @@ -92,6 +96,7 @@ def test_label_encode_drop_one(): assert df2.columns[1] == "cats_labels" +@pytest.mark.filterwarnings("ignore:DataFrame.label_encoding is deprecated") def test_label_encode_float_output(): random.seed(0) np.random.seed(0) @@ -116,6 +121,7 @@ def test_label_encode_float_output(): np.testing.assert_equal(got, handcoded) +@pytest.mark.filterwarnings("ignore:Series.label_encoding is deprecated") @pytest.mark.parametrize( "ncats,cat_dtype", [(10, np.int8), (127, np.int8), (128, np.int16)] ) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index abd24ddd0fd..2b71ca7ac36 100644 --- 
a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -540,6 +540,26 @@ def test_listcol_setitem(data, item): assert_eq(expect, sr) +@pytest.mark.parametrize( + "data", + [ + [[1, 2, 3], [4, 5, 6], [7, 8, 9]], + [ + [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3], [4, 5, 6]], + [[1, 2, 3], [4, 5, 6]], + ], + [[[1, 2, 3], [4, None, 6]], [], None, [[7, 8], [], None, [9]]], + [[1, 2, 3], [4, None, 6], [7, 8], [], None, [9]], + [[1.0, 2.0, 3.0], [4.0, None, 6.0], [7.0, 8.0], [], None, [9.0]], + ], +) +def test_listcol_as_string(data): + got = cudf.Series(data).astype("str") + expect = pd.Series(data).astype("str") + assert_eq(expect, got) + + @pytest.mark.parametrize( "data,item,error", [ diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index e2b1d72c63e..d409a099806 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -299,7 +299,7 @@ def test_multiindex_loc(pdf, gdf, pdfIndex, key_tuple): assert_eq(pdfIndex, gdfIndex) pdf.index = pdfIndex gdf.index = gdfIndex - assert_eq(pdf.loc[key_tuple], gdf.loc[key_tuple]) + assert_eq(pdf.loc[key_tuple].sort_index(), gdf.loc[key_tuple].sort_index()) @pytest.mark.parametrize( @@ -666,8 +666,7 @@ def test_multiindex_equals(): ) @pytest.mark.parametrize("names", [["X", "Y"]]) def test_multiindex_copy_sem(data, levels, codes, names): - """Test semantic equality for MultiIndex.copy - """ + """Test semantic equality for MultiIndex.copy""" gdf = cudf.DataFrame(data) pdf = gdf.to_pandas() @@ -740,8 +739,8 @@ def test_multiindex_copy_sem(data, levels, codes, names): @pytest.mark.parametrize("deep", [True, False]) def test_multiindex_copy_deep(data, deep): """Test memory idendity for deep copy - Case1: Constructed from GroupBy, StringColumns - Case2: Constrcuted from MultiIndex, NumericColumns + Case1: Constructed from GroupBy, StringColumns + Case2: Constrcuted from MultiIndex, NumericColumns """ same_ref = not deep @@ -967,26 +966,34 @@ def test_multiindex_rows_with_wildcard(pdf, gdf, pdfIndex): gdfIndex = cudf.from_pandas(pdfIndex) pdf.index = pdfIndex gdf.index = gdfIndex - assert_eq(pdf.loc[("a",), :], gdf.loc[("a",), :]) - assert_eq(pdf.loc[(("a"), ("store")), :], gdf.loc[(("a"), ("store")), :]) + assert_eq(pdf.loc[("a",), :].sort_index(), gdf.loc[("a",), :].sort_index()) assert_eq( - pdf.loc[(("a"), ("store"), ("storm")), :], - gdf.loc[(("a"), ("store"), ("storm")), :], + pdf.loc[(("a"), ("store")), :].sort_index(), + gdf.loc[(("a"), ("store")), :].sort_index(), ) assert_eq( - pdf.loc[(("a"), ("store"), ("storm"), ("smoke")), :], - gdf.loc[(("a"), ("store"), ("storm"), ("smoke")), :], + pdf.loc[(("a"), ("store"), ("storm")), :].sort_index(), + gdf.loc[(("a"), ("store"), ("storm")), :].sort_index(), ) assert_eq( - pdf.loc[(slice(None), "store"), :], gdf.loc[(slice(None), "store"), :] + pdf.loc[(("a"), ("store"), ("storm"), ("smoke")), :].sort_index(), + gdf.loc[(("a"), ("store"), ("storm"), ("smoke")), :].sort_index(), ) assert_eq( - pdf.loc[(slice(None), slice(None), "storm"), :], - gdf.loc[(slice(None), slice(None), "storm"), :], + pdf.loc[(slice(None), "store"), :].sort_index(), + gdf.loc[(slice(None), "store"), :].sort_index(), ) assert_eq( - pdf.loc[(slice(None), slice(None), slice(None), "smoke"), :], - gdf.loc[(slice(None), slice(None), slice(None), "smoke"), :], + pdf.loc[(slice(None), slice(None), "storm"), :].sort_index(), + gdf.loc[(slice(None), slice(None), "storm"), :].sort_index(), + ) + assert_eq( + pdf.loc[ + (slice(None), 
slice(None), slice(None), "smoke"), : + ].sort_index(), + gdf.loc[ + (slice(None), slice(None), slice(None), "smoke"), : + ].sort_index(), ) @@ -1574,6 +1581,101 @@ def test_difference(): assert_eq(expected, actual) +@pytest.mark.parametrize( + "idx1, idx2", + [ + ( + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], ["Red", "Blue", "Red", "Blue"]], + names=["a", "b"], + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 4], ["Red", "Green", "Red", "Green"]], + names=["x", "y"], + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 4], [0.2, 0.4, 1.4, 10], [3, 3, 2, 4]] + ), + ), + ], +) +@pytest.mark.parametrize("sort", [None, False]) +def test_union_mulitIndex(idx1, idx2, sort): + expected = idx1.union(idx2, sort=sort) + + idx1 = cudf.from_pandas(idx1) + idx2 = cudf.from_pandas(idx2) + + actual = idx1.union(idx2, sort=sort) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "idx1, idx2", + [ + ( + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ), + pd.MultiIndex.from_arrays( + [[1, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], ["Red", "Blue", "Red", "Blue"]], + names=["a", "b"], + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 4], ["Red", "Green", "Red", "Green"]], + names=["x", "y"], + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 4], [0.2, 0.4, 1.4, 10], [3, 3, 2, 4]] + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + ), + ), + ], +) +@pytest.mark.parametrize("sort", [None, False]) +def test_intersection_mulitIndex(idx1, idx2, sort): + expected = idx1.intersection(idx2, sort=sort) + + idx1 = cudf.from_pandas(idx1) + idx2 = cudf.from_pandas(idx2) + + actual = idx1.intersection(idx2, sort=sort) + assert_eq(expected, actual, exact=False) + + @pytest.mark.parametrize( "names", [ @@ -1601,3 +1703,42 @@ def test_pickle_roundtrip_multiIndex(names): local_file.seek(0) actual_df = pickle.load(local_file) assert_eq(expected_df, actual_df) + + +@pytest.mark.parametrize( + "pidx", + [ + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[1.0, 2, 3, 4], [5, 6, 7.8, 10], [11, 12, 12, 13]], + ), + ], +) +@pytest.mark.parametrize( + "func", + [ + "is_numeric", + "is_boolean", + "is_integer", + "is_floating", + "is_object", + "is_categorical", + "is_interval", + ], +) +def test_multiIndex_type_methods(pidx, func): + gidx = cudf.from_pandas(pidx) + + expected = getattr(pidx, func)() + actual = getattr(gidx, func)() + + if func == "is_object": + assert_eq(False, actual) + else: + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index 0292d47f31a..ed55fb86820 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -195,10 +195,10 @@ def test_get_dummies_with_nan(): ) expected = 
cudf.DataFrame( { + "a_null": [0, 0, 0, 1], "a_1.0": [1, 0, 0, 0], "a_2.0": [0, 1, 0, 0], "a_nan": [0, 0, 1, 0], - "a_null": [0, 0, 0, 1], }, dtype="uint8", ) @@ -220,7 +220,7 @@ def test_get_dummies_with_nan(): @pytest.mark.parametrize("prefix", [None, "hi"]) @pytest.mark.parametrize("dtype", ["uint8", "int16"]) def test_get_dummies_array_like(data, prefix_sep, prefix, dtype): - expected = cudf.get_dummies( + actual = cudf.get_dummies( data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype ) if isinstance(data, (cudf.Series, cudf.BaseIndex)): @@ -228,7 +228,7 @@ def test_get_dummies_array_like(data, prefix_sep, prefix, dtype): else: pd_data = data - actual = pd.get_dummies( + expected = pd.get_dummies( pd_data, prefix=prefix, prefix_sep=prefix_sep, dtype=dtype ) utils.assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 1230b4b35f3..99b5652110b 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -134,7 +134,7 @@ def test_orc_reader_trailing_nulls(datadir): pytest.skip(".orc file is not found: %s" % e) expect = orcfile.read().to_pandas().fillna(0) - got = cudf.read_orc(path, engine="cudf").fillna(0) + got = cudf.read_orc(path).fillna(0) # PANDAS uses NaN to represent invalid data, which forces float dtype # For comparison, we can replace NaN with 0 and cast to the cuDF dtype @@ -157,7 +157,7 @@ def test_orc_reader_datetimestamp(datadir, inputfile, use_index): pytest.skip(".orc file is not found: %s" % e) pdf = orcfile.read().to_pandas(date_as_object=False) - gdf = cudf.read_orc(path, engine="cudf", use_index=use_index) + gdf = cudf.read_orc(path, use_index=use_index) assert_eq(pdf, gdf, check_categorical=False) @@ -170,7 +170,7 @@ def test_orc_reader_strings(datadir): pytest.skip(".orc file is not found: %s" % e) expect = orcfile.read(columns=["string1"]) - got = cudf.read_orc(path, engine="cudf", columns=["string1"]) + got = cudf.read_orc(path, columns=["string1"]) assert_eq(expect, got, check_categorical=False) @@ -291,9 +291,7 @@ def test_orc_read_rows(datadir, skiprows, num_rows): pytest.skip(".orc file is not found: %s" % e) pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc( - path, engine="cudf", skiprows=skiprows, num_rows=num_rows - ) + gdf = cudf.read_orc(path, skiprows=skiprows, num_rows=num_rows) # Slice rows out of the whole dataframe for comparison as PyArrow doesn't # have an API to read a subsection of rows from the file @@ -335,7 +333,7 @@ def test_orc_reader_uncompressed_block(datadir): pytest.skip(".orc file is not found: %s" % e) expect = orcfile.read().to_pandas() - got = cudf.read_orc(path, engine="cudf") + got = cudf.read_orc(path) assert_eq(expect, got, check_categorical=False) @@ -351,7 +349,7 @@ def test_orc_reader_nodata_block(datadir): print(type(excpr).__name__) expect = orcfile.read().to_pandas() - got = cudf.read_orc(path, engine="cudf", num_rows=1) + got = cudf.read_orc(path, num_rows=1) assert_eq(expect, got, check_categorical=False) @@ -570,7 +568,7 @@ def test_orc_reader_tzif_timestamps(datadir): pytest.skip(".orc file is not found: %s" % e) pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc(path, engine="cudf").to_pandas() + gdf = cudf.read_orc(path).to_pandas() assert_eq(pdf, gdf) @@ -634,6 +632,10 @@ def test_orc_write_statistics(tmpdir, datadir, nrows): stats_max = file_stats[0][col]["maximum"] actual_max = gdf[col].max() assert normalized_equals(actual_max, stats_max) + if "number_of_values" in file_stats[0][col]: + stats_num_vals = 
file_stats[0][col]["number_of_values"] + actual_num_vals = gdf[col].count() + assert stats_num_vals == actual_num_vals # compare stripe statistics with actual min/max for stripe_idx in range(0, orc_file.nstripes): @@ -651,6 +653,13 @@ def test_orc_write_statistics(tmpdir, datadir, nrows): stats_max = stripes_stats[stripe_idx][col]["maximum"] assert normalized_equals(actual_max, stats_max) + if "number_of_values" in stripes_stats[stripe_idx][col]: + stats_num_vals = stripes_stats[stripe_idx][col][ + "number_of_values" + ] + actual_num_vals = stripe_df[col].count() + assert stats_num_vals == actual_num_vals + @pytest.mark.parametrize("nrows", [1, 100, 6000000]) def test_orc_write_bool_statistics(tmpdir, datadir, nrows): @@ -704,7 +713,7 @@ def test_orc_reader_gmt_timestamps(datadir): pytest.skip(".orc file is not found: %s" % e) pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc(path, engine="cudf").to_pandas() + gdf = cudf.read_orc(path).to_pandas() assert_eq(pdf, gdf) @@ -803,9 +812,7 @@ def test_orc_reader_multiple_files(datadir, num_rows): df_2 = pd.read_orc(path) df = pd.concat([df_1, df_2], ignore_index=True) - gdf = cudf.read_orc( - [path, path], engine="cudf", num_rows=num_rows - ).to_pandas() + gdf = cudf.read_orc([path, path], num_rows=num_rows).to_pandas() # Slice rows out of the whole dataframe for comparison as PyArrow doesn't # have an API to read a subsection of rows from the file @@ -821,13 +828,13 @@ def test_orc_reader_multi_file_single_stripe(datadir): # should raise an exception with pytest.raises(ValueError): - cudf.read_orc([path, path], engine="cudf", stripes=[0]) + cudf.read_orc([path, path], stripes=[0]) def test_orc_reader_multi_file_multi_stripe(datadir): path = datadir / "TestOrcFile.testStripeLevelStats.orc" - gdf = cudf.read_orc([path, path], engine="cudf", stripes=[[0, 1], [2]]) + gdf = cudf.read_orc([path, path], stripes=[[0, 1], [2]]) pdf = pd.read_orc(path) assert_eq(pdf, gdf) @@ -1197,9 +1204,7 @@ def test_orc_reader_decimal(datadir, data): pytest.skip(".orc file is not found: %s" % e) pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc( - path, engine="cudf", decimal_cols_as_float=data - ).to_pandas() + gdf = cudf.read_orc(path, decimal_cols_as_float=data).to_pandas() # Convert the decimal dtype from PyArrow to float64 for comparison to cuDF # This is because cuDF returns as float64 @@ -1217,9 +1222,7 @@ def test_orc_reader_decimal_invalid_column(datadir, data): pytest.skip(".orc file is not found: %s" % e) pdf = orcfile.read().to_pandas() - gdf = cudf.read_orc( - path, engine="cudf", decimal_cols_as_float=data - ).to_pandas() + gdf = cudf.read_orc(path, decimal_cols_as_float=data).to_pandas() # Since the `decimal_cols_as_float` column name # is invalid, this should be a decimal @@ -1254,11 +1257,6 @@ def dec(num): "lf": [[i * 0.5] * 13 for i in range(12345)], "ld": [[dec(i / 2)] * 15 for i in range(12345)], }, - # multiple stripes - { - "ls": [[str(i), str(2 * i)] for i in range(1_200_000)], - "ld": [[dec(i / 2)] * 5 for i in range(1_200_000)], - }, # with nulls { "ls": [ @@ -1303,7 +1301,9 @@ def test_orc_writer_lists(data): pdf_in = pd.DataFrame(data) buffer = BytesIO() - cudf.from_pandas(pdf_in).to_orc(buffer) + cudf.from_pandas(pdf_in).to_orc( + buffer, stripe_size_rows=2048, row_index_stride=512 + ) pdf_out = pa.orc.ORCFile(buffer).read().to_pandas() assert_eq(pdf_out, pdf_in) @@ -1492,3 +1492,37 @@ def test_empty_statistics(): assert stats[0]["i"].get("minimum") == 1 assert stats[0]["i"].get("maximum") == 1 assert stats[0]["i"].get("sum") == 
1 + + +@pytest.mark.filterwarnings("ignore:.*struct.*experimental") +@pytest.mark.parametrize( + "equivalent_columns", + [ + (["lvl1_struct.a", "lvl1_struct.b"], ["lvl1_struct"]), + (["lvl1_struct", "lvl1_struct.a"], ["lvl1_struct"]), + (["lvl1_struct.a", "lvl1_struct"], ["lvl1_struct"]), + (["lvl1_struct.b", "lvl1_struct.a"], ["lvl1_struct.b", "lvl1_struct"]), + (["lvl2_struct.lvl1_struct", "lvl2_struct"], ["lvl2_struct"]), + ( + ["lvl2_struct.a", "lvl2_struct.lvl1_struct.c", "lvl2_struct"], + ["lvl2_struct"], + ), + ], +) +def test_select_nested(list_struct_buff, equivalent_columns): + # The two column selections should be equivalent + df_cols1 = cudf.read_orc(list_struct_buff, columns=equivalent_columns[0]) + df_cols2 = cudf.read_orc(list_struct_buff, columns=equivalent_columns[1]) + assert_eq(df_cols1, df_cols2) + + +def test_orc_writer_rle_stream_size(datadir, tmpdir): + original = datadir / "TestOrcFile.int16.rle.size.orc" + reencoded = tmpdir.join("int16_map.orc") + + df = cudf.read_orc(original) + df.to_orc(reencoded) + + # Segfaults when RLE stream sizes don't account for varint length + pa_out = pa.orc.ORCFile(reencoded).read() + assert_eq(df.to_pandas(), pa_out) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 23f971d0f77..9629e502584 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -17,6 +17,8 @@ from packaging import version from pyarrow import fs as pa_fs, parquet as pq +import rmm + import cudf from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata from cudf.testing import dataset_generator as dg @@ -24,6 +26,7 @@ TIMEDELTA_TYPES, assert_eq, assert_exceptions_equal, + random_bitmask, ) @@ -32,7 +35,7 @@ def datadir(datadir): return datadir / "parquet" -@pytest.fixture(params=[1, 5, 10, 100]) +@pytest.fixture(params=[1, 5, 10, 100000]) def simple_pdf(request): types = [ "bool", @@ -149,12 +152,12 @@ def build_pdf(num_columns, day_resolution_timestamps): return test_pdf -@pytest.fixture(params=[0, 1, 10, 100]) +@pytest.fixture(params=[0, 1, 10, 10000]) def pdf(request): return build_pdf(request, False) -@pytest.fixture(params=[0, 1, 10, 100]) +@pytest.fixture(params=[0, 1, 10, 10000]) def pdf_day_timestamps(request): return build_pdf(request, True) @@ -171,7 +174,9 @@ def gdf_day_timestamps(pdf_day_timestamps): @pytest.fixture(params=["snappy", "gzip", "brotli", None, np.str_("snappy")]) def parquet_file(request, tmp_path_factory, pdf): - fname = tmp_path_factory.mktemp("parquet") / "test.parquet" + fname = tmp_path_factory.mktemp("parquet") / ( + str(request.param) + "_test.parquet" + ) pdf.to_parquet(fname, engine="pyarrow", compression=request.param) return fname @@ -241,6 +246,12 @@ def _make_parquet_path_or_buf(src): ], ) def test_parquet_reader_basic(parquet_file, columns, engine): + if rmm._cuda.gpu.runtimeGetVersion() == 11050 and "brotli_" in str( + parquet_file + ): + pytest.xfail( + "Known issue: https://github.com/rapidsai/cudf/issues/9546" + ) expect = pd.read_parquet(parquet_file, columns=columns) got = cudf.read_parquet(parquet_file, engine=engine, columns=columns) if len(expect) == 0: @@ -372,6 +383,9 @@ def test_parquet_reader_pandas_metadata(tmpdir, columns, pandas_compat): def test_parquet_read_metadata(tmpdir, pdf): + if len(pdf) > 100: + pytest.skip("Skipping long setup test") + def num_row_groups(rows, group_size): return max(1, (rows + (group_size - 1)) // group_size) @@ -505,6 +519,9 @@ def 
test_parquet_read_filtered_complex_predicate( @pytest.mark.parametrize("row_group_size", [1, 5, 100]) def test_parquet_read_row_groups(tmpdir, pdf, row_group_size): + if len(pdf) > 100: + pytest.skip("Skipping long setup test") + if "col_category" in pdf.columns: pdf = pdf.drop(columns=["col_category"]) fname = tmpdir.join("row_group.parquet") @@ -529,6 +546,9 @@ def test_parquet_read_row_groups(tmpdir, pdf, row_group_size): @pytest.mark.parametrize("row_group_size", [1, 5, 100]) def test_parquet_read_row_groups_non_contiguous(tmpdir, pdf, row_group_size): + if len(pdf) > 100: + pytest.skip("Skipping long setup test") + fname = tmpdir.join("row_group.parquet") pdf.to_parquet(fname, compression="gzip", row_group_size=row_group_size) @@ -554,6 +574,9 @@ def test_parquet_read_row_groups_non_contiguous(tmpdir, pdf, row_group_size): @pytest.mark.parametrize("row_group_size", [1, 4, 33]) def test_parquet_read_rows(tmpdir, pdf, row_group_size): + if len(pdf) > 100: + pytest.skip("Skipping long setup test") + fname = tmpdir.join("row_group.parquet") pdf.to_parquet(fname, compression="None", row_group_size=row_group_size) @@ -1867,6 +1890,10 @@ def test_parquet_allnull_str(tmpdir, engine): def normalized_equals(value1, value2): + if value1 is pd.NA or value1 is pd.NaT: + value1 = None + if value2 is pd.NA or value2 is pd.NaT: + value2 = None if isinstance(value1, pd.Timestamp): value1 = value1.to_pydatetime() if isinstance(value2, pd.Timestamp): @@ -1887,15 +1914,22 @@ def normalized_equals(value1, value2): return value1 == value2 -def test_parquet_writer_statistics(tmpdir, pdf): +@pytest.mark.parametrize("add_nulls", [True, False]) +def test_parquet_writer_statistics(tmpdir, pdf, add_nulls): file_path = tmpdir.join("cudf.parquet") if "col_category" in pdf.columns: pdf = pdf.drop(columns=["col_category", "col_bool"]) - for t in TIMEDELTA_TYPES: - pdf["col_" + t] = pd.Series(np.arange(len(pdf.index))).astype(t) + if not add_nulls: + # Timedelta types convert NA to None when reading from parquet into + # pandas which interferes with series.max()/min() + for t in TIMEDELTA_TYPES: + pdf["col_" + t] = pd.Series(np.arange(len(pdf.index))).astype(t) gdf = cudf.from_pandas(pdf) + if add_nulls: + for col in gdf: + gdf[col] = gdf[col].set_mask(random_bitmask(len(gdf))) gdf.to_parquet(file_path, index=False) # Read back from pyarrow @@ -1916,6 +1950,9 @@ def test_parquet_writer_statistics(tmpdir, pdf): stats_max = stats.max assert normalized_equals(actual_max, stats_max) + assert stats.null_count == pd_slice[col].isna().sum() + assert stats.num_values == pd_slice[col].count() + def test_parquet_writer_list_statistics(tmpdir): df = pd.DataFrame( diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 0f8b46cee35..28e63ec41f1 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -6,7 +6,7 @@ import pandas as pd import pytest -from cudf import DataFrame, GenericIndex, Series +from cudf import DataFrame, GenericIndex, RangeIndex, Series from cudf.core.buffer import Buffer from cudf.testing._utils import assert_eq @@ -28,7 +28,7 @@ def check_serialization(df): assert_frame_picklable(df[2:-2]) # sorted sortvaldf = df.sort_values("vals") - assert isinstance(sortvaldf.index, GenericIndex) + assert isinstance(sortvaldf.index, (GenericIndex, RangeIndex)) assert_frame_picklable(sortvaldf) # out-of-band if pickle.HIGHEST_PROTOCOL >= 5: diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py 
index f47e87374dc..2e7936feeae 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -58,7 +58,10 @@ def test_series_replace_all(gsr, to_replace, value): actual = gsr.replace(to_replace=gd_to_replace, value=gd_value) expected = psr.replace(to_replace=pd_to_replace, value=pd_value) - assert_eq(expected, actual) + assert_eq( + expected.sort_values().reset_index(drop=True), + actual.sort_values().reset_index(drop=True), + ) def test_series_replace(): @@ -75,7 +78,10 @@ def test_series_replace(): psr4 = psr3.replace("one", "two") sr3 = cudf.from_pandas(psr3) sr4 = sr3.replace("one", "two") - assert_eq(psr4, sr4) + assert_eq( + psr4.sort_values().reset_index(drop=True), + sr4.sort_values().reset_index(drop=True), + ) psr5 = psr3.replace("one", "five") sr5 = sr3.replace("one", "five") @@ -226,7 +232,10 @@ def test_dataframe_replace(df, to_replace, value): expected = pdf.replace(to_replace=pd_to_replace, value=pd_value) actual = gdf.replace(to_replace=gd_to_replace, value=gd_value) - assert_eq(expected, actual) + expected_sorted = expected.sort_values(by=list(expected.columns), axis=0) + actual_sorted = actual.sort_values(by=list(actual.columns), axis=0) + + assert_eq(expected_sorted, actual_sorted) def test_dataframe_replace_with_nulls(): @@ -1103,6 +1112,8 @@ def test_dataframe_exceptions_for_clip(lower, upper): ([1, 2, 3, 4, 5], None, 4), ([1, 2, 3, 4, 5], None, None), ([1, 2, 3, 4, 5], 4, 2), + ([1.0, 2.0, 3.0, 4.0, 5.0], 4, 2), + (pd.Series([1, 2, 3, 4, 5], dtype="int32"), 4, 2), (["a", "b", "c", "d", "e"], "b", "d"), (["a", "b", "c", "d", "e"], "b", None), (["a", "b", "c", "d", "e"], None, "d"), @@ -1112,7 +1123,7 @@ def test_dataframe_exceptions_for_clip(lower, upper): @pytest.mark.parametrize("inplace", [True, False]) def test_series_clip(data, lower, upper, inplace): psr = pd.Series(data) - gsr = cudf.Series.from_pandas(data) + gsr = cudf.from_pandas(psr) expect = psr.clip(lower=lower, upper=upper) got = gsr.clip(lower=lower, upper=upper, inplace=inplace) @@ -1334,4 +1345,7 @@ def test_series_replace_errors(): def test_replace_nulls(gsr, old, new, expected): actual = gsr.replace(old, new) - assert_eq(expected, actual) + assert_eq( + expected.sort_values().reset_index(drop=True), + actual.sort_values().reset_index(drop=True), + ) diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py index 29272cbf876..c146766c5e1 100644 --- a/python/cudf/cudf/tests/test_rolling.py +++ b/python/cudf/cudf/tests/test_rolling.py @@ -185,7 +185,18 @@ def test_rolling_var_std_large(agg, ddof, center, seed, window_size): expect = getattr(pdf.rolling(window_size, 1, center), agg)(ddof=ddof) got = getattr(gdf.rolling(window_size, 1, center), agg)(ddof=ddof) - assert_eq(expect, got, **kwargs) + import platform + + if platform.machine() == "aarch64": + # Due to pandas-37051, pandas rolling var/std on uniform window is + # not reliable. Skipping these rows when comparing. 
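+        # The mask got[col].fillna(-1) != 0 keeps rows whose cuDF result is
+        # null or nonzero; rows where the result is exactly 0 (uniform
+        # windows, the case hit by the pandas bug) are dropped from both
+        # the pandas and cuDF frames before comparing column by column.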
+ for col in expect: + mask = (got[col].fillna(-1) != 0).to_pandas() + expect[col] = expect[col][mask] + got[col] = got[col][mask] + assert_eq(expect[col], got[col], **kwargs) + else: + assert_eq(expect, got, **kwargs) @pytest.mark.xfail diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py index e23e8bbef89..ff551ec74ca 100644 --- a/python/cudf/cudf/tests/test_s3.py +++ b/python/cudf/cudf/tests/test_s3.py @@ -130,13 +130,25 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread): fname = "test_csv_reader.csv" bname = "csv" buffer = pdf.to_csv(index=False) + + # Use fsspec file object with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): got = cudf.read_csv( "s3://{}/{}".format(bname, fname), storage_options=s3so, bytes_per_thread=bytes_per_thread, + use_python_file_object=False, ) + assert_eq(pdf, got) + # Use Arrow PythonFile object + with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + got = cudf.read_csv( + "s3://{}/{}".format(bname, fname), + storage_options=s3so, + bytes_per_thread=bytes_per_thread, + use_python_file_object=True, + ) assert_eq(pdf, got) @@ -156,19 +168,25 @@ def test_read_csv_arrow_nativefile(s3_base, s3so, pdf): @pytest.mark.parametrize("bytes_per_thread", [32, 1024]) -def test_read_csv_byte_range(s3_base, s3so, pdf, bytes_per_thread): +@pytest.mark.parametrize("use_python_file_object", [True, False]) +def test_read_csv_byte_range( + s3_base, s3so, pdf, bytes_per_thread, use_python_file_object +): # Write to buffer fname = "test_csv_reader_byte_range.csv" bname = "csv" buffer = pdf.to_csv(index=False) + + # Use fsspec file object with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): got = cudf.read_csv( "s3://{}/{}".format(bname, fname), storage_options=s3so, byte_range=(74, 73), bytes_per_thread=bytes_per_thread, - header=False, + header=None, names=["Integer", "Float", "Integer2", "String", "Boolean"], + use_python_file_object=use_python_file_object, ) assert_eq(pdf.iloc[-2:].reset_index(drop=True), got) @@ -254,7 +272,8 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns): assert_eq(expect, got) -def test_read_parquet_filters(s3_base, s3so, pdf): +@pytest.mark.parametrize("python_file", [True, False]) +def test_read_parquet_filters(s3_base, s3so, pdf, python_file): fname = "test_parquet_reader_filters.parquet" bname = "parquet" buffer = BytesIO() @@ -266,6 +285,7 @@ def test_read_parquet_filters(s3_base, s3so, pdf): "s3://{}/{}".format(bname, fname), storage_options=s3so, filters=filters, + use_python_file_object=python_file, ) # All row-groups should be filtered out @@ -308,7 +328,9 @@ def test_read_json(s3_base, s3so): assert_eq(expect, got) -def test_read_orc(s3_base, s3so, datadir): +@pytest.mark.parametrize("use_python_file_object", [False, True]) +@pytest.mark.parametrize("columns", [None, ["string1"]]) +def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns): source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") fname = "test_orc_reader.orc" bname = "orc" @@ -319,9 +341,36 @@ def test_read_orc(s3_base, s3so, datadir): with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): got = cudf.read_orc( - "s3://{}/{}".format(bname, fname), storage_options=s3so + "s3://{}/{}".format(bname, fname), + columns=columns, + storage_options=s3so, + use_python_file_object=use_python_file_object, ) + if columns: + expect = expect[columns] + assert_eq(expect, got) + + +@pytest.mark.parametrize("columns", [None, ["string1"]]) +def 
test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns): + source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc") + fname = "test_orc_reader.orc" + bname = "orc" + expect = pa.orc.ORCFile(source_file).read().to_pandas() + + with open(source_file, "rb") as f: + buffer = f.read() + + with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}): + fs = pa_fs.S3FileSystem( + endpoint_override=s3so["client_kwargs"]["endpoint_url"], + ) + with fs.open_input_file("{}/{}".format(bname, fname)) as fil: + got = cudf.read_orc(fil, columns=columns) + + if columns: + expect = expect[columns] assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index b436825cf69..440dcf527ca 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -44,7 +44,7 @@ ) @pytest.mark.parametrize("to_host", [True, False]) def test_serialize(df, to_host): - """ This should hopefully replace all functions below """ + """This should hopefully replace all functions below""" a = df() if "cudf" not in type(a).__module__: a = cudf.from_pandas(a) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ca179703864..09f0417b7ac 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -673,7 +673,7 @@ def test_series_mode(df, dropna): np.arange(-100.5, 101.5, 1), ], ) -@pytest.mark.parametrize("decimals", [-5, -3, -1, 0, 1, 4, 12]) +@pytest.mark.parametrize("decimals", [-5, -3, -1, 0, 1, 4, 12, np.int8(1)]) def test_series_round(arr, decimals): pser = pd.Series(arr) ser = cudf.Series(arr) @@ -1272,3 +1272,66 @@ def test_series_sort_index( assert_eq(ps, gs, check_index_type=True) else: assert_eq(expected, got, check_index_type=True) + + +@pytest.mark.parametrize( + "method,validation_data", + [ + ( + "md5", + [ + "d41d8cd98f00b204e9800998ecf8427e", + "cfcd208495d565ef66e7dff9f98764da", + "3d3aaae21d57b101227f0384f644abe0", + "3e76c7023d771ad1c1520c27ab3d4874", + "f8d805e33ec3ade1a6ea251ac1c118e7", + "c30515f66a5aec7af7666abf33600c92", + "c61a4185135eda043f35e92c3505e180", + "52da74c75cb6575d25be29e66bd0adde", + "5152ac13bdd09110d9ee9c169a3d9237", + "f1d3ff8443297732862df21dc4e57262", + ], + ) + ], +) +def test_series_hash_values(method, validation_data): + inputs = cudf.Series( + [ + "", + "0", + "A 56 character string to test message padding algorithm.", + "A 63 character string to test message padding algorithm, again.", + "A 64 character string to test message padding algorithm, again!!", + ( + "A very long (greater than 128 bytes/char string) to execute " + "a multi hash-step data point in the hash function being " + "tested. This string needed to be longer." 
+ ), + "All work and no play makes Jack a dull boy", + "!\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~", + "\x00\x00\x00\x10\x00\x00\x00\x00", + "\x00\x00\x00\x00", + ] + ) + validation_results = cudf.Series(validation_data) + hash_values = inputs.hash_values(method=method) + assert_eq(hash_values, validation_results) + + +def test_set_index_unequal_length(): + s = cudf.Series() + with pytest.raises(ValueError): + s.index = [1, 2, 3] + + +@pytest.mark.parametrize( + "lhs, rhs", [("a", "a"), ("a", "b"), (1, 1.0), (None, None), (None, "a")] +) +def test_equals_names(lhs, rhs): + lhs = cudf.Series([1, 2], name=lhs) + rhs = cudf.Series([1, 2], name=rhs) + + got = lhs.equals(rhs) + expect = lhs.to_pandas().equals(rhs.to_pandas()) + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 3a42411c839..00cd31e7539 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -117,8 +117,7 @@ def test_series_sort_index(nelem, asc): @pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) @pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 7]) def test_series_nlargest(data, n): - """Indirectly tests Series.sort_values() - """ + """Indirectly tests Series.sort_values()""" sr = Series(data) psr = pd.Series(data) assert_eq(sr.nlargest(n), psr.nlargest(n)) @@ -136,8 +135,7 @@ def test_series_nlargest(data, n): @pytest.mark.parametrize("data", [[0, 1, 1, 2, 2, 2, 3, 3], [0], [1, 2, 3]]) @pytest.mark.parametrize("n", [-100, -50, -12, -2, 0, 1, 2, 3, 4, 9]) def test_series_nsmallest(data, n): - """Indirectly tests Series.sort_values() - """ + """Indirectly tests Series.sort_values()""" sr = Series(data) psr = pd.Series(data) assert_eq(sr.nsmallest(n), psr.nsmallest(n)) @@ -156,33 +154,16 @@ def test_series_nsmallest(data, n): @pytest.mark.parametrize("nelem,n", [(1, 1), (100, 100), (10, 5), (100, 10)]) -def test_dataframe_nlargest(nelem, n): +@pytest.mark.parametrize("op", ["nsmallest", "nlargest"]) +@pytest.mark.parametrize("columns", ["a", ["b", "a"]]) +def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): np.random.seed(0) - df = DataFrame() - df["a"] = aa = np.random.random(nelem) - df["b"] = bb = np.random.random(nelem) - res = df.nlargest(n, "a") - - # Check - inds = np.argsort(aa) - assert_eq(res["a"].to_numpy(), aa[inds][-n:][::-1]) - assert_eq(res["b"].to_numpy(), bb[inds][-n:][::-1]) - assert_eq(res.index.values, inds[-n:][::-1]) - - -@pytest.mark.parametrize("nelem,n", [(10, 5), (100, 10)]) -def test_dataframe_nsmallest(nelem, n): - np.random.seed(0) - df = DataFrame() - df["a"] = aa = np.random.random(nelem) - df["b"] = bb = np.random.random(nelem) - res = df.nsmallest(n, "a") + aa = np.random.random(nelem) + bb = np.random.random(nelem) - # Check - inds = np.argsort(-aa) - assert_eq(res["a"].to_numpy(), aa[inds][-n:][::-1]) - assert_eq(res["b"].to_numpy(), bb[inds][-n:][::-1]) - assert_eq(res.index.values, inds[-n:][::-1]) + df = DataFrame({"a": aa, "b": bb}) + pdf = df.to_pandas() + assert_eq(getattr(df, op)(n, columns), getattr(pdf, op)(n, columns)) @pytest.mark.parametrize( @@ -283,6 +264,27 @@ def test_dataframe_multi_column_nulls( ) +@pytest.mark.parametrize( + "ascending", list(product((True, False), (True, False))) +) +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_dataframe_multi_column_nulls_multiple_ascending( + ascending, na_position +): + pdf = pd.DataFrame( + {"a": [3, 1, None, 2, 2, None, 1], "b": [1, 2, 3, 4, 5, 
6, 7]} + ) + gdf = DataFrame.from_pandas(pdf) + expect = pdf.sort_values( + by=["a", "b"], ascending=ascending, na_position=na_position + ) + actual = gdf.sort_values( + by=["a", "b"], ascending=ascending, na_position=na_position + ) + + assert_eq(actual, expect) + + @pytest.mark.parametrize("nelem", [1, 100]) def test_series_nlargest_nelem(nelem): np.random.seed(0) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index b254a6ba02c..c75eb91a335 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -183,7 +183,19 @@ def test_string_astype(dtype): ): data = ["1", "2", "3", "4", "5"] elif dtype.startswith("float"): - data = ["1.0", "2.0", "3.0", "4.0", "5.0"] + data = [ + "1.0", + "2.0", + "3.0", + "4.0", + None, + "5.0", + "nan", + "-INF", + "NaN", + "inF", + "NAn", + ] elif dtype.startswith("bool"): data = ["True", "False", "True", "False", "False"] elif dtype.startswith("datetime64"): @@ -834,12 +846,19 @@ def test_string_extract(ps_gs, pat, expand, flags, flags_raise): ("FGHI", False), ], ) -@pytest.mark.parametrize("flags,flags_raise", [(0, 0), (1, 1)]) +@pytest.mark.parametrize( + "flags,flags_raise", + [(0, 0), (re.MULTILINE | re.DOTALL, 0), (re.I, 1), (re.I | re.DOTALL, 1)], +) @pytest.mark.parametrize("na,na_raise", [(np.nan, 0), (None, 1), ("", 1)]) def test_string_contains(ps_gs, pat, regex, flags, flags_raise, na, na_raise): ps, gs = ps_gs - expectation = raise_builder([flags_raise, na_raise], NotImplementedError) + expectation = does_not_raise() + if flags_raise: + expectation = pytest.raises(ValueError) + if na_raise: + expectation = pytest.raises(NotImplementedError) with expectation: expect = ps.str.contains(pat, flags=flags, na=na, regex=regex) @@ -873,7 +892,7 @@ def test_string_repeat(data, repeats): assert_eq(expect, got) -# Pandas isn't respect the `n` parameter so ignoring it in test parameters +# Pandas doesn't respect the `n` parameter so ignoring it in test parameters @pytest.mark.parametrize( "pat,regex", [("a", False), ("f", False), (r"[a-z]", True), (r"[A-Z]", True)], @@ -1386,6 +1405,26 @@ def test_string_char_case(case_op, data): assert_eq(gs.str.isempty(), ps == "") +def test_string_is_title(): + data = [ + "leopard", + "Golden Eagle", + "SNAKE", + "", + "!A", + "hello World", + "A B C", + "#", + "AƻB", + "Ⓑⓖ", + "Art of War", + ] + gs = cudf.Series(data) + ps = pd.Series(data) + + assert_eq(gs.str.istitle(), ps.str.istitle()) + + @pytest.mark.parametrize( "data", [ @@ -1708,15 +1747,22 @@ def test_string_wrap(data, width): ["23", "³", "⅕", ""], [" ", "\t\r\n ", ""], ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["line to be wrapped", "another line to be wrapped"], + ["line\nto be wrapped", "another\nline\nto be wrapped"], ], ) -@pytest.mark.parametrize("pat", ["a", " ", "\t", "another", "0", r"\$"]) -def test_string_count(data, pat): +@pytest.mark.parametrize( + "pat", ["a", " ", "\t", "another", "0", r"\$", "^line$", "line.*be"] +) +@pytest.mark.parametrize("flags", [0, re.MULTILINE, re.DOTALL]) +def test_string_count(data, pat, flags): gs = cudf.Series(data) ps = pd.Series(data) - assert_eq(gs.str.count(pat=pat), ps.str.count(pat=pat), check_dtype=False) + assert_eq( + gs.str.count(pat=pat, flags=flags), + ps.str.count(pat=pat, flags=flags), + check_dtype=False, + ) assert_eq(as_index(gs).str.count(pat=pat), pd.Index(ps).str.count(pat=pat)) @@ -1765,6 +1811,7 @@ def test_string_replace_multi(): "([a-z])-([a-zé])", "([a-z])-([a-z])", "([a-z])-([a-zé])", + 
re.compile("([A-Z])(\\d)"), ], ) @pytest.mark.parametrize( @@ -2071,6 +2118,32 @@ def test_string_contains_multi(data, sub, expect): assert_eq(expect, got, check_dtype=False) +# Pandas does not allow 'case' or 'flags' if 'pat' is re.Pattern +# This covers contains, match, count, and replace +@pytest.mark.parametrize( + "pat", [re.compile("[n-z]"), re.compile("[A-Z]"), re.compile("de"), "A"], +) +@pytest.mark.parametrize("repl", ["xyz", "", " "]) +def test_string_compiled_re(ps_gs, pat, repl): + ps, gs = ps_gs + + expect = ps.str.contains(pat, regex=True) + got = gs.str.contains(pat, regex=True) + assert_eq(expect, got) + + expect = ps.str.match(pat) + got = gs.str.match(pat) + assert_eq(expect, got) + + expect = ps.str.count(pat) + got = gs.str.count(pat) + assert_eq(expect, got, check_dtype=False) + + expect = ps.str.replace(pat, repl, regex=True) + got = gs.str.replace(pat, repl, regex=True) + assert_eq(expect, got) + + @pytest.mark.parametrize( "data", [ diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index cd9e84a2c69..dbff626c363 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -7,7 +7,7 @@ import cudf from cudf.core.dtypes import StructDtype -from cudf.testing._utils import assert_eq +from cudf.testing._utils import DATETIME_TYPES, TIMEDELTA_TYPES, assert_eq @pytest.mark.parametrize( @@ -282,3 +282,35 @@ def test_struct_field_errors(data): with pytest.raises(IndexError): got.struct.field(100) + + +@pytest.mark.parametrize("dtype", DATETIME_TYPES + TIMEDELTA_TYPES) +def test_struct_with_datetime_and_timedelta(dtype): + df = cudf.DataFrame( + { + "a": [12, 232, 2334], + "datetime": cudf.Series([23432, 3432423, 324324], dtype=dtype), + } + ) + series = df.to_struct() + a_array = np.array([12, 232, 2334]) + datetime_array = np.array([23432, 3432423, 324324]).astype(dtype) + + actual = series.to_pandas() + values_list = [] + for i, val in enumerate(a_array): + values_list.append({"a": val, "datetime": datetime_array[i]}) + + expected = pd.Series(values_list) + assert_eq(expected, actual) + + +def test_struct_int_values(): + series = cudf.Series( + [{"a": 1, "b": 2}, {"a": 10, "b": None}, {"a": 5, "b": 6}] + ) + actual_series = series.to_pandas() + + assert isinstance(actual_series[0]["b"], int) + assert isinstance(actual_series[1]["b"], type(None)) + assert isinstance(actual_series[2]["b"], int) diff --git a/python/cudf/cudf/tests/test_subword_tokenizer.py b/python/cudf/cudf/tests/test_subword_tokenizer.py index d5207c79b86..717b3de8479 100644 --- a/python/cudf/cudf/tests/test_subword_tokenizer.py +++ b/python/cudf/cudf/tests/test_subword_tokenizer.py @@ -32,7 +32,9 @@ def test_subword_tokenize_on_disk_vocab_str_api(datadir): the vocabulary is not pre-loaded and is accessed via the string accessor """ - with open(os.path.join(datadir, "test_sentences.txt")) as file: + with open( + os.path.join(datadir, "test_sentences.txt"), encoding="utf-8" + ) as file: input_sentence_ls = [line.strip() for line in file] vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") @@ -55,7 +57,9 @@ def test_subword_tokenize_on_disk_vocab_str_api(datadir): def test_subword_tokenize( seq_len, stride, add_special_tokens, do_lower_case, datadir ): - with open(os.path.join(datadir, "test_sentences.txt")) as file: + with open( + os.path.join(datadir, "test_sentences.txt"), encoding="utf-8" + ) as file: input_sentence_ls = [line.strip() for line in file] vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") diff 
--git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 6b81785c879..fcae0a21b6a 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -230,7 +230,7 @@ def test_ngrams(n, separator, expected_values): @pytest.mark.parametrize( - "n, expected_values", + "n, expected_values, as_list", [ ( 2, @@ -247,16 +247,22 @@ def test_ngrams(n, separator, expected_values): "er", "re", ], + False, + ), + (3, ["thi", "his", "boo", "ook", "her", "ere"], False), + ( + 3, + [["thi", "his"], [], [], ["boo", "ook"], ["her", "ere"], []], + True, ), - (3, ["thi", "his", "boo", "ook", "her", "ere"]), ], ) -def test_character_ngrams(n, expected_values): +def test_character_ngrams(n, expected_values, as_list): strings = cudf.Series(["this", "is", "my", "book", "here", ""]) expected = cudf.Series(expected_values) - actual = strings.str.character_ngrams(n=n) + actual = strings.str.character_ngrams(n=n, as_list=as_list) assert type(expected) == type(actual) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py index c0018dae47d..dc126546f15 100644 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ b/python/cudf/cudf/tests/test_udf_masked_ops.py @@ -1,51 +1,36 @@ +import math import operator -import pandas as pd +import numpy as np import pytest +from numba import cuda import cudf -from cudf.core.udf.pipeline import nulludf -from cudf.testing._utils import NUMERIC_TYPES, assert_eq - -arith_ops = [ - operator.add, - operator.sub, - operator.mul, - operator.truediv, - operator.floordiv, - operator.mod, - operator.pow, -] - -comparison_ops = [ - operator.eq, - operator.ne, - operator.lt, - operator.le, - operator.gt, - operator.ge, -] - - -def run_masked_udf_test(func_pdf, func_gdf, data, **kwargs): +from cudf.core.scalar import NA +from cudf.core.udf._ops import ( + arith_ops, + bitwise_ops, + comparison_ops, + unary_ops, +) +from cudf.testing._utils import NUMERIC_TYPES, _decimal_series, assert_eq + + +def run_masked_udf_test(func, data, args=(), **kwargs): gdf = data pdf = data.to_pandas(nullable=True) - expect = pdf.apply( - lambda row: func_pdf(*[row[i] for i in data.columns]), axis=1 - ) - obtain = gdf.apply( - lambda row: func_gdf(*[row[i] for i in data.columns]), axis=1 - ) + expect = pdf.apply(func, args=args, axis=1) + obtain = gdf.apply(func, args=args, axis=1) assert_eq(expect, obtain, **kwargs) -def run_masked_udf_series(func_psr, func_gsr, data, **kwargs): +def run_masked_udf_series(func, data, args=(), **kwargs): gsr = data psr = data.to_pandas(nullable=True) - expect = psr.apply(func_psr) - obtain = gsr.apply(func_gsr) + expect = psr.apply(func, args=args) + obtain = gsr.apply(func, args=args) assert_eq(expect, obtain, **kwargs) @@ -54,15 +39,73 @@ def test_arith_masked_vs_masked(op): # This test should test all the typing # and lowering for arithmetic ops between # two columns - def func_pdf(x, y): + def func(row): + x = row["a"] + y = row["b"] return op(x, y) - @nulludf - def func_gdf(x, y): + gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("op", bitwise_ops) +def test_bitwise_masked_vs_masked(op): + # This test should test all the typing + # and lowering for bitwise ops between + # two columns + def func(row): + x = row["a"] + y = row["b"] return op(x, y) - gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) - 
run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + gdf = cudf.DataFrame( + { + "a": [1, 0, 1, 0, 0b1011, 42, None], + "b": [1, 1, 0, 0, 0b1100, -42, 5], + } + ) + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize( + "dtype_l", + ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"], +) +@pytest.mark.parametrize( + "dtype_r", + [ + "timedelta64[ns]", + "timedelta64[us]", + "timedelta64[ms]", + "timedelta64[s]", + "datetime64[ns]", + "datetime64[ms]", + "datetime64[us]", + "datetime64[s]", + ], +) +@pytest.mark.parametrize("op", [operator.add, operator.sub]) +def test_arith_masked_vs_masked_datelike(op, dtype_l, dtype_r): + # Datetime version of the above + # does not test all dtype combinations for now + if "datetime" in dtype_l and "datetime" in dtype_r and op is operator.add: + # don't try adding datetimes to datetimes. + pytest.skip("Adding datetime to datetime is not valid") + + def func(row): + x = row["a"] + y = row["b"] + return op(x, y) + + gdf = cudf.DataFrame( + { + "a": ["2011-01-01", cudf.NA, "2011-03-01", cudf.NA], + "b": [4, 5, cudf.NA, cudf.NA], + } + ) + gdf["a"] = gdf["a"].astype(dtype_l) + gdf["b"] = gdf["b"].astype(dtype_r) + run_masked_udf_test(func, gdf, check_dtype=False) @pytest.mark.parametrize("op", comparison_ops) @@ -71,11 +114,9 @@ def test_compare_masked_vs_masked(op): # typing and lowering for comparisons # between columns - def func_pdf(x, y): - return op(x, y) - - @nulludf - def func_gdf(x, y): + def func(row): + x = row["a"] + y = row["b"] return op(x, y) # we should get: @@ -83,18 +124,15 @@ def func_gdf(x, y): gdf = cudf.DataFrame( {"a": [1, 0, None, 1, None], "b": [0, 1, 0, None, None]} ) - run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, check_dtype=False) @pytest.mark.parametrize("op", arith_ops) @pytest.mark.parametrize("constant", [1, 1.5, True, False]) @pytest.mark.parametrize("data", [[1, 2, cudf.NA]]) def test_arith_masked_vs_constant(op, constant, data): - def func_pdf(x): - return op(x, constant) - - @nulludf - def func_gdf(x): + def func(row): + x = row["data"] return op(x, constant) gdf = cudf.DataFrame({"data": data}) @@ -112,18 +150,15 @@ def func_gdf(x): # - pow(x, False) because we have an NA in the series and pandas # insists that (NA**0 == 1) where we do not pytest.skip() - run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, check_dtype=False) @pytest.mark.parametrize("op", arith_ops) @pytest.mark.parametrize("constant", [1, 1.5, True, False]) @pytest.mark.parametrize("data", [[2, 3, cudf.NA], [1, cudf.NA, 1]]) def test_arith_masked_vs_constant_reflected(op, constant, data): - def func_pdf(x): - return op(constant, x) - - @nulludf - def func_gdf(x): + def func(row): + x = row["data"] return op(constant, x) # Just a single column -> result will be all NA @@ -135,79 +170,100 @@ def func_gdf(x): # - True**NA # both due to pandas insisting that this is equal to 1. 
pytest.skip() - run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, check_dtype=False) @pytest.mark.parametrize("op", arith_ops) @pytest.mark.parametrize("data", [[1, cudf.NA, 3], [2, 3, cudf.NA]]) def test_arith_masked_vs_null(op, data): - def func_pdf(x): - return op(x, pd.NA) - - @nulludf - def func_gdf(x): - return op(x, cudf.NA) + def func(row): + x = row["data"] + return op(x, NA) gdf = cudf.DataFrame({"data": data}) if 1 in gdf["data"] and op is operator.pow: # In pandas, 1**NA == 1. pytest.skip() - run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, check_dtype=False) @pytest.mark.parametrize("op", arith_ops) def test_arith_masked_vs_null_reflected(op): - def func_pdf(x): - return op(pd.NA, x) - - @nulludf - def func_gdf(x): - return op(cudf.NA, x) + def func(row): + x = row["data"] + return op(NA, x) gdf = cudf.DataFrame({"data": [1, None, 3]}) - run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, check_dtype=False) -def test_masked_is_null_conditional(): - def func_pdf(x, y): - if x is pd.NA: - return y - else: - return x + y +@pytest.mark.parametrize("op", unary_ops) +def test_unary_masked(op): + # This test should test all the typing + # and lowering for unary ops + + def func(row): + x = row["a"] + return op(x) if x is not NA else NA + + if "log" in op.__name__: + gdf = cudf.DataFrame({"a": [0.1, 1.0, None, 3.5, 1e8]}) + elif op.__name__ in {"asin", "acos"}: + gdf = cudf.DataFrame({"a": [0.0, 0.5, None, 1.0]}) + elif op.__name__ in {"atanh"}: + gdf = cudf.DataFrame({"a": [0.0, -0.5, None, 0.8]}) + elif op.__name__ in {"acosh", "sqrt", "lgamma"}: + gdf = cudf.DataFrame({"a": [1.0, 2.0, None, 11.0]}) + elif op.__name__ in {"gamma"}: + gdf = cudf.DataFrame({"a": [0.1, 2, None, 4]}) + elif op.__name__ in {"invert"}: + gdf = cudf.DataFrame({"a": [-100, 128, None, 0]}, dtype="int64") + else: + gdf = cudf.DataFrame({"a": [-125.60, 395.2, 0.0, None]}) + run_masked_udf_test(func, gdf, check_dtype=False) + - @nulludf - def func_gdf(x, y): - if x is cudf.NA: +def test_masked_is_null_conditional(): + def func(row): + x = row["a"] + y = row["b"] + if x is NA: return y else: return x + y gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) - run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, check_dtype=False) @pytest.mark.parametrize("dtype_a", list(NUMERIC_TYPES)) @pytest.mark.parametrize("dtype_b", list(NUMERIC_TYPES)) -def test_apply_mixed_dtypes(dtype_a, dtype_b): +@pytest.mark.parametrize("op", [operator.add, operator.and_, operator.eq]) +def test_apply_mixed_dtypes(dtype_a, dtype_b, op): """ Test that operations can be performed between columns of different dtypes and return a column with the correct values and nulls """ - # TODO: Parameterize over the op here - def func_pdf(x, y): - return x + y - @nulludf - def func_gdf(x, y): - return x + y + # First perform the op on two dummy data on host, if numpy can + # safely type cast, we should expect it to work in udf too. 
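+    # For example, numpy rejects bitwise ``&`` when an operand is a float,
+    # so that dtype/op combination is skipped rather than executed.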
+ try: + op(getattr(np, dtype_a)(0), getattr(np, dtype_b)(42)) + except TypeError: + pytest.skip("Operation is unsupported for corresponding dtype.") + + def func(row): + x = row["a"] + y = row["b"] + return op(x, y) gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]}) gdf["a"] = gdf["a"].astype(dtype_a) gdf["b"] = gdf["b"].astype(dtype_b) - run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, check_dtype=False) @pytest.mark.parametrize("val", [5, 5.5]) @@ -218,22 +274,17 @@ def test_apply_return_literal(val): to a MaskedType """ - def func_pdf(x, y): - if x is not pd.NA and x < 2: - return val - else: - return x + y - - @nulludf - def func_gdf(x, y): - if x is not cudf.NA and x < 2: + def func(row): + x = row["a"] + y = row["b"] + if x is not NA and x < 2: return val else: return x + y gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) - run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, check_dtype=False) def test_apply_return_null(): @@ -241,88 +292,70 @@ def test_apply_return_null(): Tests casting / unification of Masked and NA """ - def func_pdf(x): - if x is pd.NA: - return pd.NA - else: - return x - - @nulludf - def func_gdf(x): - if x is cudf.NA: - return cudf.NA + def func(row): + x = row["a"] + if x is NA: + return NA else: return x gdf = cudf.DataFrame({"a": [1, None, 3]}) - run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, check_dtype=False) def test_apply_return_either_null_or_literal(): - def func_pdf(x): + def func(row): + x = row["a"] if x > 5: return 2 else: - return pd.NA - - @nulludf - def func_gdf(x): - if x > 5: - return 2 - else: - return cudf.NA + return NA gdf = cudf.DataFrame({"a": [1, 3, 6]}) - run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, check_dtype=False) def test_apply_return_literal_only(): - def func_pdf(x): - return 5 - - @nulludf - def func_gdf(x): + def func(x): return 5 gdf = cudf.DataFrame({"a": [1, None, 3]}) - run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, check_dtype=False) def test_apply_everything(): - def func_pdf(w, x, y, z): - if x is pd.NA: + def func(row): + w = row["a"] + x = row["b"] + y = row["c"] + z = row["d"] + if x is NA: return w + y - z - elif ((z > y) is not pd.NA) and z > y: + elif ((z > y) is not NA) and z > y: return x - elif ((x + y) is not pd.NA) and x + y == 0: + elif ((x + y) is not NA) and x + y == 0: return z / x - elif x + y is pd.NA: - return 2.5 - else: - return y > 2 - - @nulludf - def func_gdf(w, x, y, z): - if x is cudf.NA: - return w + y - z - elif ((z > y) is not cudf.NA) and z > y: - return x - elif ((x + y) is not cudf.NA) and x + y == 0: - return z / x - elif x + y is cudf.NA: + elif x + y is NA: return 2.5 + elif w > 100: + return ( + math.sin(x) + + math.sqrt(y) + - (-z) + + math.lgamma(x) * math.fabs(-0.8) / math.radians(3.14) + ) else: return y > 2 gdf = cudf.DataFrame( { - "a": [1, 3, 6, 0, None, 5, None], - "b": [3.0, 2.5, None, 5.0, 1.0, 5.0, 11.0], - "c": [2, 3, 6, 0, None, 5, None], - "d": [4, None, 6, 0, None, 5, None], + "a": [1, 3, 6, 0, None, 5, None, 101], + "b": [3.0, 2.5, None, 5.0, 1.0, 5.0, 11.0, 1.0], + "c": [2, 3, 6, 0, None, 5, None, 6], + "d": [4, None, 6, 0, None, 5, None, 7.5], } ) - run_masked_udf_test(func_pdf, func_gdf, gdf, check_dtype=False) + run_masked_udf_test(func, gdf, check_dtype=False) ### @@ -335,25 
+368,19 @@ def test_series_apply_basic(data): def func(x): return x + 1 - run_masked_udf_series(func, func, data, check_dtype=False) + run_masked_udf_series(func, data, check_dtype=False) def test_series_apply_null_conditional(): - def func_pdf(x): - if x is pd.NA: - return 42 - else: - return x - 1 - - def func_gdf(x): - if x is cudf.NA: + def func(x): + if x is NA: return 42 else: return x - 1 data = cudf.Series([1, cudf.NA, 3]) - run_masked_udf_series(func_pdf, func_gdf, data) + run_masked_udf_series(func, data) ### @@ -365,7 +392,7 @@ def func(x): return op(x, x) data = cudf.Series([1, cudf.NA, 3]) - run_masked_udf_series(func, func, data, check_dtype=False) + run_masked_udf_series(func, data, check_dtype=False) @pytest.mark.parametrize("op", comparison_ops) @@ -379,7 +406,7 @@ def func(x): return op(x, x) data = cudf.Series([1, cudf.NA, 3]) - run_masked_udf_series(func, func, data, check_dtype=False) + run_masked_udf_series(func, data, check_dtype=False) @pytest.mark.parametrize("op", arith_ops) @@ -393,9 +420,9 @@ def func(x): if constant is cudf.NA and op is operator.pow: # in pandas, 1**NA == 1. In cudf, 1**NA == 1. with pytest.xfail(): - run_masked_udf_series(func, func, data, check_dtype=False) + run_masked_udf_series(func, data, check_dtype=False) return - run_masked_udf_series(func, func, data, check_dtype=False) + run_masked_udf_series(func, data, check_dtype=False) @pytest.mark.parametrize("op", arith_ops) @@ -409,24 +436,160 @@ def func(x): if constant is not cudf.NA and constant == 1 and op is operator.pow: # in pandas, 1**NA == 1. In cudf, 1**NA == 1. with pytest.xfail(): - run_masked_udf_series(func, func, data, check_dtype=False) + run_masked_udf_series(func, data, check_dtype=False) return - run_masked_udf_series(func, func, data, check_dtype=False) + run_masked_udf_series(func, data, check_dtype=False) def test_series_masked_is_null_conditional(): - def func_psr(x): - if x is pd.NA: - return 42 - else: - return x - - def func_gsr(x): - if x is cudf.NA: + def func(x): + if x is NA: return 42 else: return x data = cudf.Series([1, cudf.NA, 3, cudf.NA]) - run_masked_udf_series(func_psr, func_gsr, data, check_dtype=False) + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.parametrize("op", arith_ops + comparison_ops) +def test_masked_udf_lambda_support(op): + func = lambda row: op(row["a"], row["b"]) # noqa: E731 + + data = cudf.DataFrame( + {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} + ) + + run_masked_udf_test(func, data, check_dtype=False) + + +@pytest.mark.parametrize("op", arith_ops + comparison_ops) +def test_masked_udf_nested_function_support(op): + """ + Nested functions need to be explicitly jitted by the user + for numba to recognize them. Unfortunately the object + representing the jitted function can not itself be used in + pandas udfs. 
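# --- Illustrative sketch (not part of this patch) ----------------------------
# Series-level counterpart of the tests above, assuming run_masked_udf_series
# routes through Series.apply with the same null semantics: missing values
# reach the UDF as cudf.NA and can be tested with `is`.
import cudf
from cudf import NA

sr = cudf.Series([1, None, 3])

def fill_or_decrement(x):
    return 42 if x is NA else x - 1

out = sr.apply(fill_or_decrement)
# ------------------------------------------------------------------------------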
+ """ + + def inner(x, y): + return op(x, y) + + def outer(row): + x = row["a"] + y = row["b"] + return inner(x, y) + + gdf = cudf.DataFrame( + {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} + ) + + with pytest.raises(AttributeError): + gdf.apply(outer, axis=1) + + pdf = gdf.to_pandas(nullable=True) + inner_gpu = cuda.jit(device=True)(inner) + + def outer_gpu(row): + x = row["a"] + y = row["b"] + return inner_gpu(x, y) + + got = gdf.apply(outer_gpu, axis=1) + expect = pdf.apply(outer, axis=1) + assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + {"a": [1, 2, 3], "c": [4, 5, 6], "b": [7, 8, 9]}, + {"a": [1, 2, 3], "b": [4, 5, 6], "c": ["a", "b", "c"]}, + ], +) +def test_masked_udf_subset_selection(data): + def func(row): + return row["a"] + row["b"] + + data = cudf.DataFrame(data) + run_masked_udf_test(func, data) + + +@pytest.mark.parametrize( + "unsupported_col", + [ + ["a", "b", "c"], + _decimal_series( + ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) + ), + cudf.Series([1, 2, 3], dtype="category"), + cudf.interval_range(start=0, end=3, closed=True), + [[1, 2], [3, 4], [5, 6]], + [{"a": 1}, {"a": 2}, {"a": 3}], + ], +) +def test_masked_udf_unsupported_dtype(unsupported_col): + data = cudf.DataFrame() + data["unsupported_col"] = unsupported_col + + def func(row): + return row["unsupported_col"] + + # check that we fail when an unsupported type is used within a function + with pytest.raises(TypeError): + data.apply(func, axis=1) + + # also check that a DF containing unsupported dtypes can still run a + # function that does NOT involve any of the unsupported dtype columns + data["supported_col"] = 1 + + def other_func(row): + return row["supported_col"] + + expect = cudf.Series(np.ones(len(data))) + got = data.apply(other_func, axis=1) + + assert_eq(expect, got, check_dtype=False) + + +# tests for `DataFrame.apply(f, args=(x,y,z))` +# testing the whole space of possibilities is intractable +# these test the most rudimentary guaranteed functionality +@pytest.mark.parametrize( + "data", + [ + {"a": [1, cudf.NA, 3]}, + {"a": [0.5, 2.0, cudf.NA, cudf.NA, 5.0]}, + {"a": [True, False, cudf.NA]}, + ], +) +@pytest.mark.parametrize("op", arith_ops + comparison_ops) +def test_masked_udf_scalar_args_binops(data, op): + data = cudf.DataFrame(data) + + def func(row, c): + return op(row["a"], c) + + run_masked_udf_test(func, data, args=(1,), check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, cudf.NA, 3]}, + {"a": [0.5, 2.0, cudf.NA, cudf.NA, 5.0]}, + {"a": [True, False, cudf.NA]}, + ], +) +@pytest.mark.parametrize("op", arith_ops + comparison_ops) +def test_masked_udf_scalar_args_binops_multiple(data, op): + data = cudf.DataFrame(data) + + def func(row, c, k): + x = op(row["a"], c) + y = op(x, k) + return y + + run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index c8fb7c1a47d..fa5cde76524 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -332,8 +332,7 @@ def chunk_wise_kernel(nrows, chunks, {args}): @functools.wraps(_make_row_wise_kernel) def _load_cache_or_make_row_wise_kernel(cache_key, func, *args, **kwargs): - """Caching version of ``_make_row_wise_kernel``. 
- """ + """Caching version of ``_make_row_wise_kernel``.""" if cache_key is None: cache_key = func try: @@ -349,8 +348,7 @@ def _load_cache_or_make_row_wise_kernel(cache_key, func, *args, **kwargs): @functools.wraps(_make_chunk_wise_kernel) def _load_cache_or_make_chunk_wise_kernel(func, *args, **kwargs): - """Caching version of ``_make_row_wise_kernel``. - """ + """Caching version of ``_make_row_wise_kernel``.""" try: return _cache[func] except KeyError: @@ -360,6 +358,5 @@ def _load_cache_or_make_chunk_wise_kernel(func, *args, **kwargs): def _mangle_user(name): - """Mangle user variable name - """ + """Mangle user variable name""" return "__user_{}".format(name) diff --git a/python/cudf/cudf/utils/cudautils.py b/python/cudf/cudf/utils/cudautils.py index 7b7fe674210..5fa091a0081 100755 --- a/python/cudf/cudf/utils/cudautils.py +++ b/python/cudf/cudf/utils/cudautils.py @@ -211,6 +211,10 @@ def grouped_window_sizes_from_offset(arr, group_starts, offset): def make_cache_key(udf, sig): + """ + Build a cache key for a user defined function. Used to avoid + recompiling the same function for the same set of types + """ codebytes = udf.__code__.co_code if udf.__closure__ is not None: cvars = tuple([x.cell_contents for x in udf.__closure__]) @@ -252,8 +256,6 @@ def compile_udf(udf, type_signature): """ import cudf.core.udf - # Check if we've already compiled a similar (but possibly distinct) - # function before key = make_cache_key(udf, type_signature) res = _udf_code_cache.get(key) if res: diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index c5620bed078..2eb38c0f77e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -121,8 +121,7 @@ def np_to_pa_dtype(dtype): - """Util to convert numpy dtype to PyArrow dtype. - """ + """Util to convert numpy dtype to PyArrow dtype.""" # special case when dtype is np.datetime64 if dtype.kind == "M": time_unit, _ = np.datetime_data(dtype) @@ -153,8 +152,7 @@ def get_numeric_type_info(dtype): def numeric_normalize_types(*args): - """Cast all args to a common type using numpy promotion logic - """ + """Cast all args to a common type using numpy promotion logic""" dtype = np.result_type(*[a.dtype for a in args]) return [a.astype(dtype) for a in args] @@ -171,8 +169,8 @@ def _find_common_type_decimal(dtypes): def cudf_dtype_from_pydata_dtype(dtype): - """ Given a numpy or pandas dtype, converts it into the equivalent cuDF - Python dtype. + """Given a numpy or pandas dtype, converts it into the equivalent cuDF + Python dtype. """ if cudf.api.types.is_categorical_dtype(dtype): @@ -188,8 +186,8 @@ def cudf_dtype_from_pydata_dtype(dtype): def cudf_dtype_to_pa_type(dtype): - """ Given a cudf pandas dtype, converts it into the equivalent cuDF - Python dtype. + """Given a cudf pandas dtype, converts it into the equivalent cuDF + Python dtype. """ if cudf.api.types.is_categorical_dtype(dtype): raise NotImplementedError() @@ -204,8 +202,8 @@ def cudf_dtype_to_pa_type(dtype): def cudf_dtype_from_pa_type(typ): - """ Given a cuDF pyarrow dtype, converts it into the equivalent - cudf pandas dtype. + """Given a cuDF pyarrow dtype, converts it into the equivalent + cudf pandas dtype. 
""" if pa.types.is_list(typ): return cudf.core.dtypes.ListDtype.from_arrow(typ) diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index 532fa925670..45004c5f107 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -170,7 +170,7 @@ def _pack_keys_and_values(flattened_hash_table, original_dict): def _load_vocab_dict(path): vocab = {} - with open(path, mode="r") as f: + with open(path, mode="r", encoding="utf-8") as f: counter = 0 for line in f: vocab[line.strip()] = counter @@ -247,7 +247,7 @@ def hash_vocab( sep_token="[SEP]", ): """ - Write the vocab vocabulary hashtable to the output_path + Write the vocab vocabulary hashtable to the output_path """ np.random.seed(1243342) vocab = _load_vocab_dict(vocab_path) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index feae7ccd62d..6746753249c 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -357,6 +357,10 @@ decimal_cols_as_float: list, default None If specified, names of the columns that should be converted from Decimal to Float64 in the resulting dataframe. +use_python_file_object : boolean, default True + If True, Arrow-backed PythonFile objects will be used in place of fsspec + AbstractBufferedFile objects at IO time. This option is likely to improve + performance when making small reads from larger ORC files. kwargs are passed to the engine Returns @@ -397,6 +401,15 @@ Name of the compression to use. Use None for no compression. enable_statistics: boolean, default True Enable writing column statistics. +stripe_size_bytes: integer or None, default None + Maximum size of each stripe of the output. + If None, 67108864 (64MB) will be used. +stripe_size_rows: integer or None, default None 1000000 + Maximum number of rows of each stripe of the output. + If None, 1000000 will be used. +row_index_stride: integer or None, default None 10000 + Row index stride (maximum number of rows in each row group). + If None, 10000 will be used. Notes @@ -884,6 +897,10 @@ index_col : int, string or False, default None Column to use as the row labels of the DataFrame. Passing `index_col=False` explicitly disables index column inference and discards the last column. +use_python_file_object : boolean, default True + If True, Arrow-backed PythonFile objects will be used in place of fsspec + AbstractBufferedFile objects at IO time. This option is likely to improve + performance when making small reads from larger CSV files. Returns ------- @@ -1098,7 +1115,8 @@ def _is_local_filesystem(fs): def ensure_single_filepath_or_buffer(path_or_data, **kwargs): - """Return False if `path_or_data` resolves to multiple filepaths or buffers + """Return False if `path_or_data` resolves to multiple filepaths or + buffers. 
""" path_or_data = stringify_pathlike(path_or_data) if isinstance(path_or_data, str): @@ -1123,8 +1141,7 @@ def ensure_single_filepath_or_buffer(path_or_data, **kwargs): def is_directory(path_or_data, **kwargs): - """Returns True if the provided filepath is a directory - """ + """Returns True if the provided filepath is a directory""" path_or_data = stringify_pathlike(path_or_data) if isinstance(path_or_data, str): storage_options = kwargs.get("storage_options") @@ -1513,7 +1530,6 @@ def _fsspec_data_transfer( bytes_per_thread=256_000_000, max_gap=64_000, mode="rb", - clip_local_buffer=False, **kwargs, ): @@ -1573,14 +1589,6 @@ def _fsspec_data_transfer( path_or_fob, byte_ranges, buf, fs=fs, **kwargs, ) - if clip_local_buffer: - # If we only need the populated byte range - # (e.g. a csv byte-range read) then clip parts - # of the local buffer that are outside this range - start = byte_ranges[0][0] - end = byte_ranges[-1][0] + byte_ranges[-1][1] - return buf[start:end].tobytes() - return buf.tobytes() diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index c71a6dbccb1..217466a5a1b 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -5,7 +5,6 @@ from typing import Any, Dict import numpy as np -import six from numba import cuda import cudf @@ -91,7 +90,7 @@ def query_builder(info, funcid): lines = [def_line, " return {}".format(info["source"])] source = "\n".join(lines) glbs = {} - six.exec_(source, glbs) + exec(source, glbs) return glbs[funcid] @@ -157,8 +156,7 @@ def {kernelname}(out, {args}): def _wrap_query_expr(name, fn, args): - """Wrap the query expression in a cuda kernel. - """ + """Wrap the query expression in a cuda kernel.""" def _add_idx(arg): if arg.startswith(ENVREF_PREFIX): @@ -177,7 +175,7 @@ def _add_prefix(arg): args=", ".join(kernargs), indiced_args=", ".join(indiced_args), ) - six.exec_(src, glbls) + exec(src, glbls) kernel = glbls[name] return kernel diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index c9d38c8399e..4f9b23bf6fe 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -162,7 +162,7 @@ def __get__(self, instance, cls): return self else: value = self.func(instance) - setattr(instance, self.func.__name__, value) + object.__setattr__(instance, self.func.__name__, value) return value @@ -383,8 +383,8 @@ def _cast_to_appropriate_cudf_type(val, index=None): def _get_cupy_compatible_args_index(args, ser_index=None): """ - This function returns cupy compatible arguments and output index - if conversion is not possible it returns None + This function returns cupy compatible arguments and output index + if conversion is not possible it returns None """ casted_ls = [] diff --git a/python/cudf/setup.py b/python/cudf/setup.py index c081a719808..a8e14504469 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -209,7 +209,7 @@ def run(self): os.path.join(CUDF_ROOT, "_deps/dlpack-src/include"), os.path.join( os.path.dirname(sysconfig.get_path("include")), - "libcudf/libcudacxx", + "rapids/libcudacxx", ), os.path.dirname(sysconfig.get_path("include")), np.get_include(), diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py index f16b7b42e4e..824babfa10a 100644 --- a/python/cudf_kafka/setup.py +++ b/python/cudf_kafka/setup.py @@ -63,7 +63,7 @@ os.path.join(CUDF_ROOT, "_deps/libcudacxx-src/include"), os.path.join( os.path.dirname(sysconfig.get_path("include")), - "libcudf/libcudacxx", + "rapids/libcudacxx", 
), os.path.dirname(sysconfig.get_path("include")), np.get_include(), diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index e897571807b..02a782151db 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -313,11 +313,7 @@ def union_categoricals_cudf( def safe_hash(frame): - index = frame.index - if isinstance(frame, cudf.DataFrame): - return cudf.Series(frame.hash_columns(), index=index) - else: - return cudf.Series(frame.hash_values(), index=index) + return cudf.Series(frame.hash_values(), index=frame.index) @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 5040646f9f1..bf063918c89 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -33,7 +33,7 @@ class _Frame(dd.core._Frame, OperatorMethodMixin): - """ Superclass for DataFrame and Series + """Superclass for DataFrame and Series Parameters ---------- @@ -233,6 +233,8 @@ def sort_values( max_branch=None, divisions=None, set_divisions=False, + ascending=True, + na_position="last", **kwargs, ): if kwargs: @@ -241,7 +243,9 @@ def sort_values( ) if self.npartitions == 1: - df = self.map_partitions(M.sort_values, by) + df = self.map_partitions( + M.sort_values, by, ascending=ascending, na_position=na_position + ) else: df = sorting.sort_values( self, @@ -250,6 +254,8 @@ def sort_values( divisions=divisions, set_divisions=set_divisions, ignore_index=ignore_index, + ascending=ascending, + na_position=na_position, ) if ignore_index: @@ -257,13 +263,13 @@ def sort_values( return df def to_parquet(self, path, *args, **kwargs): - """ Calls dask.dataframe.io.to_parquet with CudfEngine backend """ + """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" from dask_cudf.io import to_parquet return to_parquet(self, path, *args, **kwargs) def to_orc(self, path, **kwargs): - """ Calls dask_cudf.io.to_orc """ + """Calls dask_cudf.io.to_orc""" from dask_cudf.io import to_orc return to_orc(self, path, **kwargs) @@ -320,8 +326,7 @@ def repartition(self, *args, **kwargs): return super().repartition(*args, **kwargs) def shuffle(self, *args, **kwargs): - """ Wraps dask.dataframe DataFrame.shuffle method - """ + """Wraps dask.dataframe DataFrame.shuffle method""" shuffle_arg = kwargs.pop("shuffle", None) if shuffle_arg and shuffle_arg != "tasks": raise ValueError("dask_cudf does not support disk-based shuffle.") diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 0cf9d835523..149d98ebfb9 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -19,6 +19,19 @@ import cudf +SUPPORTED_AGGS = ( + "count", + "mean", + "std", + "var", + "sum", + "min", + "max", + "collect", + "first", + "last", +) + class CudfDataFrameGroupBy(DataFrameGroupBy): def __init__(self, *args, **kwargs): @@ -60,23 +73,24 @@ def mean(self, split_every=None, split_out=1): as_index=self.as_index, ) + def collect(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.index, + {c: "collect" for c in self.obj.columns if c not in self.index}, + split_every=split_every, + split_out=split_out, + dropna=self.dropna, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + ) + def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() arg = _redirect_aggs(arg) - _supported = { - "count", - "mean", - "std", - "var", - 
"sum", - "min", - "max", - "collect", - "first", - "last", - } if ( isinstance(self.obj, DaskDataFrame) and ( @@ -86,7 +100,7 @@ def aggregate(self, arg, split_every=None, split_out=1): and all(isinstance(x, str) for x in self.index) ) ) - and _is_supported(arg, _supported) + and _is_supported(arg, SUPPORTED_AGGS) ): if isinstance(self._meta.grouping.keys, cudf.MultiIndex): keys = self._meta.grouping.keys.names @@ -129,33 +143,62 @@ def mean(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] + def std(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.index, + {self._slice: "std"}, + split_every=split_every, + split_out=split_out, + dropna=self.dropna, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + )[self._slice] + + def var(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.index, + {self._slice: "var"}, + split_every=split_every, + split_out=split_out, + dropna=self.dropna, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + )[self._slice] + + def collect(self, split_every=None, split_out=1): + return groupby_agg( + self.obj, + self.index, + {self._slice: "collect"}, + split_every=split_every, + split_out=split_out, + dropna=self.dropna, + sep=self.sep, + sort=self.sort, + as_index=self.as_index, + )[self._slice] + def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() arg = _redirect_aggs(arg) - _supported = { - "count", - "mean", - "std", - "var", - "sum", - "min", - "max", - "collect", - "first", - "last", - } + if not isinstance(arg, dict): + arg = {self._slice: arg} if ( isinstance(self.obj, DaskDataFrame) and isinstance(self.index, (str, list)) - and _is_supported({self._slice: arg}, _supported) + and _is_supported(arg, SUPPORTED_AGGS) ): return groupby_agg( self.obj, self.index, - {self._slice: arg}, + arg, split_every=split_every, split_out=split_out, dropna=self.dropna, @@ -180,42 +223,30 @@ def groupby_agg( sort=False, as_index=True, ): - """ Optimized groupby aggregation for Dask-CuDF. - - This aggregation algorithm only supports the following options: - - - "count" - - "mean" - - "std" - - "var" - - "sum" - - "min" - - "max" - - "collect" - - "first" - - "last" - - This "optimized" approach is more performant than the algorithm - in `dask.dataframe`, because it allows the cudf backend to - perform multiple aggregations at once. + """Optimized groupby aggregation for Dask-CuDF. + + This aggregation algorithm only supports the following options: + + - "count" + - "mean" + - "std" + - "var" + - "sum" + - "min" + - "max" + - "collect" + - "first" + - "last" + + This "optimized" approach is more performant than the algorithm + in `dask.dataframe`, because it allows the cudf backend to + perform multiple aggregations at once. """ # Assert that aggregations are supported aggs = _redirect_aggs(aggs_in) - _supported = { - "count", - "mean", - "std", - "var", - "sum", - "min", - "max", - "collect", - "first", - "last", - } - if not _is_supported(aggs, _supported): + if not _is_supported(aggs, SUPPORTED_AGGS): raise ValueError( - f"Supported aggs include {_supported} for groupby_agg API. " + f"Supported aggs include {SUPPORTED_AGGS} for groupby_agg API. " f"Aggregations must be specified with dict or list syntax." 
) @@ -348,8 +379,7 @@ def groupby_agg( def _redirect_aggs(arg): - """ Redirect aggregations to their corresponding name in cuDF - """ + """Redirect aggregations to their corresponding name in cuDF""" redirects = { sum: "sum", max: "max", @@ -375,8 +405,7 @@ def _redirect_aggs(arg): def _is_supported(arg, supported: set): - """ Check that aggregations in `arg` are a subset of `supported` - """ + """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): if isinstance(arg, dict): _global_set: Set[str] = set() @@ -395,8 +424,7 @@ def _is_supported(arg, supported: set): def _make_name(*args, sep="_"): - """ Combine elements of `args` into a new string - """ + """Combine elements of `args` into a new string""" _args = (arg for arg in args if arg != "") return sep.join(_args) @@ -404,15 +432,15 @@ def _make_name(*args, sep="_"): def _groupby_partition_agg( df, gb_cols, aggs, columns, split_out, dropna, sort, sep ): - """ Initial partition-level aggregation task. - - This is the first operation to be executed on each input - partition in `groupby_agg`. Depending on `aggs`, four possible - groupby aggregations ("count", "sum", "min", and "max") are - performed. The result is then partitioned (by hashing `gb_cols`) - into a number of distinct dictionary elements. The number of - elements in the output dictionary (`split_out`) corresponds to - the number of partitions in the final output of `groupby_agg`. + """Initial partition-level aggregation task. + + This is the first operation to be executed on each input + partition in `groupby_agg`. Depending on `aggs`, four possible + groupby aggregations ("count", "sum", "min", and "max") are + performed. The result is then partitioned (by hashing `gb_cols`) + into a number of distinct dictionary elements. The number of + elements in the output dictionary (`split_out`) corresponds to + the number of partitions in the final output of `groupby_agg`. """ # Modify dict for initial (partition-wise) aggregations @@ -459,15 +487,15 @@ def _groupby_partition_agg( def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): - """ Node in groupby-aggregation reduction tree. - - Following the initial `_groupby_partition_agg` tasks, - the `groupby_agg` algorithm will perform a tree reduction - to combine the data from the input partitions into - `split_out` different output partitions. For each node in - the reduction tree, the input DataFrame objects are - concatenated, and "sum", "min" and/or "max" groupby - aggregations are used to combine the necessary statistics. + """Node in groupby-aggregation reduction tree. + + Following the initial `_groupby_partition_agg` tasks, + the `groupby_agg` algorithm will perform a tree reduction + to combine the data from the input partitions into + `split_out` different output partitions. For each node in + the reduction tree, the input DataFrame objects are + concatenated, and "sum", "min" and/or "max" groupby + aggregations are used to combine the necessary statistics. 
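# --- Illustrative sketch (not part of this patch) ----------------------------
# The two-stage shape of groupby_agg described above, shown with pandas for
# portability: partition-level partial aggregates ("count", "sum") are combined
# by summation in the tree reduction, and the final task derives higher-order
# results such as "mean".
import pandas as pd

part1 = pd.DataFrame({"x": [1, 1, 2], "y": [1.0, 3.0, 2.0]})
part2 = pd.DataFrame({"x": [1, 2, 2], "y": [5.0, 4.0, 6.0]})

partials = [p.groupby("x")["y"].agg(["count", "sum"]) for p in (part1, part2)]
combined = pd.concat(partials).groupby(level=0).sum()
mean = combined["sum"] / combined["count"]
# `mean` matches pd.concat([part1, part2]).groupby("x")["y"].mean()
# ------------------------------------------------------------------------------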
""" df = _concat(dfs, ignore_index=True) @@ -478,7 +506,7 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): agg = col.split(sep)[-1] if agg in ("count", "sum"): agg_dict[col] = ["sum"] - elif agg in ("min", "max", "collect"): + elif agg in SUPPORTED_AGGS: agg_dict[col] = [agg] else: raise ValueError(f"Unexpected aggregation: {agg}") @@ -493,8 +521,7 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): - """ Calculate variance (given count, sum, and sum-squared columns). - """ + """Calculate variance (given count, sum, and sum-squared columns).""" # Select count, sum, and sum-squared n = df[count_name] @@ -526,13 +553,13 @@ def _finalize_gb_agg( str_cols_out, aggs_renames, ): - """ Final aggregation task. + """Final aggregation task. - This is the final operation on each output partitions - of the `groupby_agg` algorithm. This function must - take care of higher-order aggregations, like "mean", - "std" and "var". We also need to deal with the column - index, the row index, and final sorting behavior. + This is the final operation on each output partitions + of the `groupby_agg` algorithm. This function must + take care of higher-order aggregations, like "mean", + "std" and "var". We also need to deal with the column + index, the row index, and final sorting behavior. """ # Deal with higher-order aggregations diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index 4dc803b3259..132201a349e 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -111,6 +111,7 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs): return read_csv_without_chunksize(path, **kwargs) dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") + usecols = kwargs.pop("usecols", None) meta = dask_reader(filenames[0], **kwargs)._meta dsk = {} @@ -130,11 +131,14 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs): "names" ] = meta.columns # no header in the middle of the file kwargs2["header"] = None + kwargs2["usecols"] = usecols dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) i += 1 divisions = [None] * (len(dsk) + 1) + if usecols is not None: + meta = meta[usecols] return dd.core.new_dd_object(dsk, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index c82ed5bc375..2e5d55e92d2 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -5,7 +5,6 @@ from io import BufferedWriter, BytesIO, IOBase import numpy as np -import pyarrow as pa from pyarrow import dataset as pa_ds, parquet as pq from dask import dataframe as dd @@ -229,9 +228,8 @@ def read_partition( if index and (index[0] in df.columns): df = df.set_index(index[0]) - elif index is False and set(df.index.names).issubset(columns): - # If index=False, we need to make sure all of the - # names in `columns` are actually in `df.columns` + elif index is False and df.index.names != (None,): + # If index=False, we shouldn't have a named index df.reset_index(inplace=True) return df @@ -332,13 +330,18 @@ def set_object_dtypes_from_pa_schema(df, schema): # pyarrow schema. if schema: for col_name, col in df._data.items(): - typ = schema.field(col_name).type + if col_name is None: + # Pyarrow cannot handle `None` as a field name. 
+ # However, this should be a simple range index that + # we can ignore anyway + continue + typ = cudf_dtype_from_pa_type(schema.field(col_name).type) if ( col_name in schema.names - and not isinstance(typ, (pa.ListType, pa.StructType)) + and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype)) and isinstance(col, cudf.core.column.StringColumn) ): - df._data[col_name] = col.astype(cudf_dtype_from_pa_type(typ)) + df._data[col_name] = col.astype(typ) def read_parquet( @@ -348,7 +351,7 @@ def read_parquet( row_groups_per_part=None, **kwargs, ): - """ Read parquet files into a Dask DataFrame + """Read parquet files into a Dask DataFrame Calls ``dask.dataframe.read_parquet`` to cordinate the execution of ``cudf.read_parquet``, and ultimately read multiple partitions into diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index db1c47c8819..98061f6c624 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -134,3 +134,19 @@ def test_read_csv_chunksize_none(tmp_path, compression, size): df.to_csv(path, index=False, compression=compression) df2 = dask_cudf.read_csv(path, chunksize=None, dtype=typ) dd.assert_eq(df, df2) + + +def test_csv_reader_usecols(tmp_path): + df = cudf.DataFrame( + { + "a": [1, 2, 3, 4] * 100, + "b": ["a", "b", "c", "d"] * 100, + "c": [10, 11, 12, 13] * 100, + } + ) + csv_path = str(tmp_path / "usecols_data.csv") + df.to_csv(csv_path, index=False) + ddf = dask_cudf.from_cudf(df[["b", "c"]], npartitions=5) + ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"]) + + dd.assert_eq(ddf, ddf2, check_divisions=False, check_index=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 4c263ca2e53..d93037b3802 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -86,6 +86,17 @@ def test_roundtrip_from_dask_index_false(tmpdir): dd.assert_eq(ddf.reset_index(drop=False), ddf2) +def test_roundtrip_from_dask_none_index_false(tmpdir): + tmpdir = str(tmpdir) + path = os.path.join(tmpdir, "test.parquet") + + df2 = ddf.reset_index(drop=True).compute() + df2.to_parquet(path, engine="pyarrow") + + ddf3 = dask_cudf.read_parquet(path, index=False) + dd.assert_eq(df2, ddf3) + + @pytest.mark.parametrize("write_meta", [True, False]) def test_roundtrip_from_dask_cudf(tmpdir, write_meta): tmpdir = str(tmpdir) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 9949016d6a7..5f2af445170 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -22,11 +22,19 @@ def set_index_post(df, index_name, drop, column_dtype): return df2 -def _set_partitions_pre(s, divisions): - partitions = divisions.searchsorted(s, side="right") - 1 - partitions[ - divisions.tail(1).searchsorted(s, side="right").astype("bool") - ] = (len(divisions) - 2) +def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): + if ascending: + partitions = divisions.searchsorted(s, side="right") - 1 + else: + partitions = ( + len(divisions) - divisions.searchsorted(s, side="right") - 1 + ) + partitions[(partitions < 0) | (partitions >= len(divisions) - 1)] = ( + 0 if ascending else (len(divisions) - 2) + ) + partitions[s._columns[0].isnull().values] = ( + len(divisions) - 2 if na_position == "last" else 0 + ) return partitions @@ -38,7 +46,7 @@ def _quantile(a, q): def 
merge_quantiles(finalq, qs, vals): - """ Combine several quantile calculations of different data. + """Combine several quantile calculations of different data. [NOTE: Same logic as dask.array merge_percentiles] """ if isinstance(finalq, Iterator): @@ -201,7 +209,7 @@ def quantile_divisions(df, by, npartitions): divisions[col].iloc[-1] = chr( ord(divisions[col].iloc[-1][0]) + 1 ) - divisions = divisions.drop_duplicates() + divisions = divisions.drop_duplicates().sort_index() return divisions @@ -212,9 +220,13 @@ def sort_values( divisions=None, set_divisions=False, ignore_index=False, + ascending=True, + na_position="last", ): - """ Sort by the given list/tuple of column names. - """ + """Sort by the given list/tuple of column names.""" + if na_position not in ("first", "last"): + raise ValueError("na_position must be either 'first' or 'last'") + npartitions = df.npartitions if isinstance(by, tuple): by = list(by) @@ -232,7 +244,11 @@ def sort_values( divisions = df._meta._constructor_sliced(divisions, dtype=dtype) partitions = df[by].map_partitions( - _set_partitions_pre, divisions=divisions, meta=meta + _set_partitions_pre, + divisions=divisions, + ascending=ascending, + na_position=na_position, + meta=meta, ) df2 = df.assign(_partitions=partitions) @@ -247,8 +263,11 @@ def sort_values( df3.divisions = (None,) * (df3.npartitions + 1) # Step 3 - Return final sorted df - df4 = df3.map_partitions(M.sort_values, by) + df4 = df3.map_partitions( + M.sort_values, by, ascending=ascending, na_position=na_position + ) if not isinstance(divisions, gd.DataFrame) and set_divisions: # Can't have multi-column divisions elsewhere in dask (yet) df4.divisions = methods.tolist(divisions) + return df4 diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 6569ffa94c5..fce9b773dac 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -11,11 +11,12 @@ from cudf.core._compat import PANDAS_GE_120 import dask_cudf -from dask_cudf.groupby import _is_supported +from dask_cudf.groupby import SUPPORTED_AGGS, _is_supported -@pytest.mark.parametrize("aggregation", ["sum", "mean", "count", "min", "max"]) -def test_groupby_basic_aggs(aggregation): +@pytest.mark.parametrize("aggregation", SUPPORTED_AGGS) +@pytest.mark.parametrize("series", [False, True]) +def test_groupby_basic(series, aggregation): pdf = pd.DataFrame( { "x": np.random.randint(0, 5, size=10000), @@ -24,19 +25,23 @@ def test_groupby_basic_aggs(aggregation): ) gdf = cudf.DataFrame.from_pandas(pdf) + gdf_grouped = gdf.groupby("x") + ddf_grouped = dask_cudf.from_cudf(gdf, npartitions=5).groupby("x") - ddf = dask_cudf.from_cudf(gdf, npartitions=5) + if series: + gdf_grouped = gdf_grouped.x + ddf_grouped = ddf_grouped.x - a = getattr(gdf.groupby("x"), aggregation)() - b = getattr(ddf.groupby("x"), aggregation)().compute() + a = getattr(gdf_grouped, aggregation)() + b = getattr(ddf_grouped, aggregation)().compute() if aggregation == "count": dd.assert_eq(a, b, check_dtype=False) else: dd.assert_eq(a, b) - a = gdf.groupby("x").agg({"x": aggregation}) - b = ddf.groupby("x").agg({"x": aggregation}).compute() + a = gdf_grouped.agg({"x": aggregation}) + b = ddf_grouped.agg({"x": aggregation}).compute() if aggregation == "count": dd.assert_eq(a, b, check_dtype=False) @@ -117,31 +122,6 @@ def test_groupby_std(func): dd.assert_eq(a, b) -@pytest.mark.parametrize( - "func", - [ - lambda df: df.groupby("x").agg({"y": "collect"}), - lambda df: 
df.groupby("x").y.agg("collect"), - ], -) -def test_groupby_collect(func): - pdf = pd.DataFrame( - { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), - } - ) - - gdf = cudf.DataFrame.from_pandas(pdf) - - ddf = dask_cudf.from_cudf(gdf, npartitions=5) - - a = func(gdf).to_pandas() - b = func(ddf).compute().to_pandas() - - dd.assert_eq(a, b) - - # reason gotattr in cudf @pytest.mark.parametrize( "func", diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 9d113cf2104..f4ae83245cb 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -1,3 +1,4 @@ +import cupy as cp import numpy as np import pytest @@ -7,13 +8,13 @@ import cudf import dask_cudf -from dask_cudf.sorting import quantile_divisions +@pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("by", ["a", "b", "c", "d", ["a", "b"], ["c", "d"]]) @pytest.mark.parametrize("nelem", [10, 500]) @pytest.mark.parametrize("nparts", [1, 10]) -def test_sort_values(nelem, nparts, by): +def test_sort_values(nelem, nparts, by, ascending): np.random.seed(0) df = cudf.DataFrame() df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1]) @@ -23,13 +24,14 @@ def test_sort_values(nelem, nparts, by): ddf = dd.from_pandas(df, npartitions=nparts) with dask.config.set(scheduler="single-threaded"): - got = ddf.sort_values(by=by) - expect = df.sort_values(by=by) + got = ddf.sort_values(by=by, ascending=ascending) + expect = df.sort_values(by=by, ascending=ascending) dd.assert_eq(got, expect, check_index=False) +@pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]]) -def test_sort_values_single_partition(by): +def test_sort_values_single_partition(by, ascending): df = cudf.DataFrame() nelem = 1000 df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1]) @@ -37,8 +39,8 @@ def test_sort_values_single_partition(by): ddf = dd.from_pandas(df, npartitions=1) with dask.config.set(scheduler="single-threaded"): - got = ddf.sort_values(by=by) - expect = df.sort_values(by=by) + got = ddf.sort_values(by=by, ascending=ascending) + expect = df.sort_values(by=by, ascending=ascending) dd.assert_eq(got, expect) @@ -52,24 +54,32 @@ def test_sort_repartition(): dd.assert_eq(len(new_ddf), len(ddf)) +@pytest.mark.parametrize("na_position", ["first", "last"]) +@pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]]) -def test_sort_values_with_nulls(by): - df = cudf.DataFrame( +@pytest.mark.parametrize( + "data", + [ { - "a": list(range(50)) + [None] * 50 + list(range(50, 100)), - "b": [None] * 100 + list(range(100, 150)), - } - ) - ddf = dd.from_pandas(df, npartitions=10) - - # assert that quantile divisions of dataframe contains nulls - divisions = quantile_divisions(ddf, by, ddf.npartitions) - if isinstance(divisions, list): - assert None in divisions - else: - assert all([divisions[col].has_nulls for col in by]) - - got = ddf.sort_values(by=by) - expect = df.sort_values(by=by) + "a": [None] * 100 + list(range(100, 150)), + "b": list(range(50)) + [None] * 50 + list(range(50, 100)), + }, + {"a": list(range(15)) + [None] * 5, "b": list(reversed(range(20)))}, + ], +) +def test_sort_values_with_nulls(data, by, ascending, na_position): + np.random.seed(0) + cp.random.seed(0) + df = cudf.DataFrame(data) + ddf = dd.from_pandas(df, npartitions=5) - dd.assert_eq(got, expect) + with 
dask.config.set(scheduler="single-threaded"): + got = ddf.sort_values( + by=by, ascending=ascending, na_position=na_position + ) + expect = df.sort_values( + by=by, ascending=ascending, na_position=na_position + ) + + # cudf ordering for nulls is non-deterministic + dd.assert_eq(got[by], expect[by], check_index=False)
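# --- Illustrative sketch (not part of this patch) ----------------------------
# End-to-end use of the sort options threaded through dask_cudf above
# (requires a GPU-enabled environment; data values are arbitrary).
import cudf
import dask_cudf

gdf = cudf.DataFrame({"a": [3, None, 1, 2], "b": [1.0, 2.0, None, 4.0]})
ddf = dask_cudf.from_cudf(gdf, npartitions=2)

# Nulls in "a" are placed first; the remaining rows are sorted descending.
out = ddf.sort_values(by="a", ascending=False, na_position="first").compute()
# ------------------------------------------------------------------------------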